modm_data.pdf2html.convert

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4from anytree import RenderTree
 5from typing import Iterable
 6
 7from .html import format_document, write_html
 8from .render import annotate_debug_info
 9from ..utils import pkg_apply_patch, pkg_file_exists, apply_patch
10from .ast import merge_area
11from pathlib import Path
12import pypdfium2 as pp
13
14
def convert(
    doc: pp.PdfDocument,
    page_range: Iterable[int],
    output_path: Path,
    format_chapters: bool = False,
    pretty: bool = True,
    render_html: bool = True,
    render_pdf: bool = False,
    render_all: bool = False,
    show_ast: bool = False,
    show_tree: bool = False,
    show_tags: bool = False,
) -> bool:
    """
    Convert the selected pages of a PDF document to HTML and/or debug output.

    Pages are taken from `doc.pages(page_range)`. A page is skipped unless
    `render_all` is set or the page reports `is_relevant`. The content areas of
    every processed page are merged into one document tree, which is normalized
    via `doc._normalize()` before it is printed or written out.

    :param doc: the document to convert; must provide `pages()` and `_normalize()`.
    :param page_range: the page numbers handed to `doc.pages()`.
    :param output_path: the HTML output file, or the folder that receives the
        `chapter_*.html` files when `format_chapters` is set. Its stem also
        names the debug PDF.
    :param format_chapters: write one HTML file per top-level `chapter` node
        instead of a single file.
    :param pretty: forwarded to `write_html()`.
    :param render_html: format the merged document as HTML and write it.
    :param render_pdf: write the annotated pages to `debug_<stem>.pdf`, using a
        path relative to the current working directory.
    :param render_all: also process pages that are not marked as relevant.
    :param show_ast: print the tree of every content area of every page.
    :param show_tree: print the merged and normalized document tree.
    :param show_tags: print the description of every structure of every page.
    :return: always `True`, also when no page was processed.
    """
    document = None
    debug_doc = None
    debug_index = 0
    # The merged document tree is only needed for these two outputs.
    needs_document = show_tree or render_html

    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if needs_document or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if needs_document:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = annotate_debug_info(page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was annotated: there is nothing to wrap in a PdfDocument,
            # so do not create an empty file and fail while saving it.
            print("No pages rendered, skipping debug PDF!")
        else:
            with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if needs_document:
        if document is None:
            print("No pages parsed, empty document!")
            return True

        document = doc._normalize(document)
        if show_tree:
            print(RenderTree(document))

        if render_html:
            if format_chapters:
                # Only top-level nodes named "chapter" get their own file.
                for chapter in document.children:
                    if chapter.name == "chapter":
                        print(f"\nFormatting HTML for '{chapter.title}'")
                        html = format_document(chapter)
                        output_file = f"{output_path}/chapter_{chapter._filename}.html"
                        print(f"\nWriting HTML '{output_file}'")
                        write_html(html, output_file, pretty=pretty)
            else:
                print("\nFormatting HTML")
                html = format_document(document)
                print(f"\nWriting HTML '{str(output_path)}'")
                write_html(html, str(output_path), pretty=pretty)

    return True
83
84
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """
    Apply a patch to the converted output at `output_path`.

    An explicitly given `patch_file` is applied with `apply_patch()`. Otherwise
    a packaged patch is looked up in `data_module`: first `<doc.name>.patch`,
    then `<prefix>.patch`, where the prefix is `doc.name` up to its first `-`.

    :param doc: the document; only its `name` is used, to derive the patch names.
    :param data_module: the package that is searched for the patch files.
    :param output_path: the path that the patch is applied to.
    :param patch_file: an explicit patch file, which skips the package lookup.
    :return: the result of the patch helper, or `True` if no packaged patch exists.
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # Prefer the patch written for this exact document version.
    versioned_patch = f"{doc.name}.patch"
    if pkg_file_exists(data_module, versioned_patch):
        return pkg_apply_patch(data_module, versioned_patch, output_path)

    # Otherwise fall back to the patch shared between all versions.
    shared_patch = f"{doc.name.split('-')[0]}.patch"
    if pkg_file_exists(data_module, shared_patch):
        return pkg_apply_patch(data_module, shared_patch, output_path)

    # Nothing to apply counts as success.
    return True
def convert( doc: pypdfium2._helpers.document.PdfDocument, page_range: Iterable[int], output_path: pathlib.Path, format_chapters: bool = False, pretty: bool = True, render_html: bool = True, render_pdf: bool = False, render_all: bool = False, show_ast: bool = False, show_tree: bool = False, show_tags: bool = False) -> bool:
def convert(
    doc: pp.PdfDocument,
    page_range: Iterable[int],
    output_path: Path,
    format_chapters: bool = False,
    pretty: bool = True,
    render_html: bool = True,
    render_pdf: bool = False,
    render_all: bool = False,
    show_ast: bool = False,
    show_tree: bool = False,
    show_tags: bool = False,
) -> bool:
    """
    Walk over the pages in `page_range` and produce the requested outputs.

    For every page from `doc.pages(page_range)` that is relevant (or for every
    page if `render_all` is set), this optionally prints the page structures
    and content trees, merges the content areas into one document tree and
    annotates the page for the debug PDF. The merged tree is normalized with
    `doc._normalize()` and then printed and/or written as HTML.

    :param doc: the document to convert; must provide `pages()` and `_normalize()`.
    :param page_range: the page numbers handed to `doc.pages()`.
    :param output_path: where the HTML goes: a file, or with `format_chapters`
        the folder for the `chapter_*.html` files. Its stem names the debug PDF.
    :param format_chapters: write every top-level `chapter` node to its own file.
    :param pretty: forwarded to `write_html()`.
    :param render_html: write the HTML output.
    :param render_pdf: save the annotated pages as `debug_<stem>.pdf`, using a
        path relative to the current working directory.
    :param render_all: do not skip pages that are not marked as relevant.
    :param show_ast: print the tree of each content area.
    :param show_tree: print the normalized document tree.
    :param show_tags: print the description of each page structure.
    :return: always `True`, also when there was nothing to convert.
    """
    build_document = show_tree or render_html
    read_areas = build_document or show_ast

    document = None
    debug_doc = None
    debug_index = 0
    for page in doc.pages(page_range):
        if not (render_all or page.is_relevant):
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if read_areas:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if build_document:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = annotate_debug_info(page, debug_doc, debug_index)
            debug_index += 1

    # `debug_doc` stays None when no page was annotated. Wrapping None in a
    # PdfDocument fails and would leave an empty file behind, so skip the file.
    if render_pdf and debug_doc is None:
        print("No pages rendered, skipping debug PDF!")
    elif render_pdf:
        with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
            pp.PdfDocument(debug_doc).save(file_handle)

    if not build_document:
        return True
    if document is None:
        print("No pages parsed, empty document!")
        return True

    document = doc._normalize(document)
    if show_tree:
        print(RenderTree(document))
    if not render_html:
        return True

    if not format_chapters:
        print("\nFormatting HTML")
        html = format_document(document)
        print(f"\nWriting HTML '{str(output_path)}'")
        write_html(html, str(output_path), pretty=pretty)
        return True

    # One output file per top-level chapter node; other children are ignored.
    for chapter in document.children:
        if chapter.name != "chapter":
            continue
        print(f"\nFormatting HTML for '{chapter.title}'")
        html = format_document(chapter)
        output_file = f"{output_path}/chapter_{chapter._filename}.html"
        print(f"\nWriting HTML '{output_file}'")
        write_html(html, output_file, pretty=pretty)
    return True
def patch( doc, data_module, output_path: pathlib.Path, patch_file: pathlib.Path = None) -> bool:
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """
    Patch the output at `output_path`, either with an explicit patch file or
    with a patch that is packaged in `data_module`.

    The packaged patch is searched under two names, in this order:
    `<doc.name>.patch` and `<doc.name up to the first '-'>.patch`.

    :param doc: the document whose `name` determines the packaged patch names.
    :param data_module: the package containing the patch files.
    :param output_path: the target of the patch.
    :param patch_file: if given, applied directly via `apply_patch()`.
    :return: `True` if there is no packaged patch, otherwise whatever the patch
        helper returns.
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # Version-specific patch first.
    candidate = f"{doc.name}.patch"
    found = pkg_file_exists(data_module, candidate)
    if not found:
        # Then the patch shared between versions.
        candidate = f"{doc.name.split('-')[0]}.patch"
        found = pkg_file_exists(data_module, candidate)

    return pkg_apply_patch(data_module, candidate, output_path) if found else True