modm_data.pdf2html.convert

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4from anytree import RenderTree
 5
 6from .html import format_document, write_html
 7from .render import render_page_pdf
 8from ..utils import pkg_apply_patch, pkg_file_exists, apply_patch
 9from .ast import merge_area
10from pathlib import Path
11import pypdfium2 as pp
12
13
def convert(
    doc,
    page_range,
    output_path,
    format_chapters=False,
    pretty=True,
    render_html=True,
    render_pdf=False,
    render_all=False,
    show_ast=False,
    show_tree=False,
    show_tags=False,
) -> bool:
    """
    Convert the selected pages of a document to HTML, with optional debug output.

    Every page yielded by `doc.pages(page_range)` is visited. Its content areas
    (`page.content_ast`) are merged into a single document tree, which is then
    normalized via `doc._normalize()` and formatted/written as HTML.

    :param doc: the document; must provide `pages(page_range)` and `_normalize()`.
    :param page_range: forwarded unchanged to `doc.pages()`.
    :param output_path: HTML output file, or the output folder when
        `format_chapters` is set. Its stem also names the debug PDF.
    :param format_chapters: write one `chapter_<filename>.html` file per child
        of the document tree whose name is `"chapter"` into `output_path`,
        instead of one single HTML file.
    :param pretty: forwarded to `write_html()`.
    :param render_html: format and write the HTML output.
    :param render_pdf: pass every visited page to `render_page_pdf()` and save
        the result as `debug_<output_path stem>.pdf` (relative path).
    :param render_all: also visit pages whose `is_relevant` attribute is false.
    :param show_ast: print the tree of every content area of every visited page.
    :param show_tree: print the tree of the merged and normalized document.
    :param show_tags: print `descr()` of every entry in `page.structures`.
    :return: always `True`.
    """
    document = None
    debug_doc = None
    debug_index = 0
    # The merged document tree is only needed for these two outputs.
    build_document = show_tree or render_html

    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if build_document or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if build_document:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            # The debug document is threaded through the calls so that every
            # page ends up in the same output document.
            debug_doc = render_page_pdf(doc, page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was visited (empty range or nothing relevant): there is
            # nothing to wrap in a PdfDocument, so do not create an empty file
            # and fail on the missing document.
            print("No pages rendered, skipping debug PDF!")
        else:
            # Path() accepts both str and Path, matching the HTML branch below
            # which only ever uses str(output_path).
            debug_file = f"debug_{Path(output_path).stem}.pdf"
            with open(debug_file, "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if not build_document:
        return True

    if document is None:
        print("No pages parsed, empty document!")
        return True

    document = doc._normalize(document)
    if show_tree:
        print(RenderTree(document))

    if render_html:
        if format_chapters:
            for chapter in document.children:
                if chapter.name == "chapter":
                    print(f"\nFormatting HTML for '{chapter.title}'")
                    html = format_document(chapter)
                    output_file = f"{output_path}/chapter_{chapter._filename}.html"
                    print(f"\nWriting HTML '{output_file}'")
                    write_html(html, output_file, pretty=pretty)
        else:
            print("\nFormatting HTML")
            html = format_document(document)
            print(f"\nWriting HTML '{str(output_path)}'")
            write_html(html, str(output_path), pretty=pretty)

    return True
82
83
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """
    Apply a patch file to `output_path`.

    An explicitly given `patch_file` is handed to `apply_patch()`. Otherwise
    the patch is looked up inside `data_module`, trying the version-specific
    name `<doc.name>.patch` first and then the name shared between versions,
    built from the part of `doc.name` before the first `-`.

    :param doc: the document; only its `name` attribute is read.
    :param data_module: package that is searched for the patch file.
    :param output_path: forwarded to the patch helper as the patch target.
    :param patch_file: explicit patch file; skips the package lookup.
    :return: `True` if no patch file exists in `data_module`, otherwise the
        result of the patch helper that was called.
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # Prefer the patch written for exactly this document version.
    versioned_name = f"{doc.name}.patch"
    if pkg_file_exists(data_module, versioned_name):
        return pkg_apply_patch(data_module, versioned_name, output_path)

    # Fall back to the patch shared by all versions of the document.
    shared_name = f"{doc.name.split('-')[0]}.patch"
    if pkg_file_exists(data_module, shared_name):
        return pkg_apply_patch(data_module, shared_name, output_path)

    # Nothing to apply counts as success.
    return True
def convert( doc, page_range, output_path, format_chapters=False, pretty=True, render_html=True, render_pdf=False, render_all=False, show_ast=False, show_tree=False, show_tags=False) -> bool:
def convert(
    doc,
    page_range,
    output_path,
    format_chapters=False,
    pretty=True,
    render_html=True,
    render_pdf=False,
    render_all=False,
    show_ast=False,
    show_tree=False,
    show_tags=False,
) -> bool:
    """
    Convert the selected pages of a document to HTML, with optional debug output.

    Every page yielded by `doc.pages(page_range)` is visited. Its content areas
    (`page.content_ast`) are merged into a single document tree, which is then
    normalized via `doc._normalize()` and formatted/written as HTML.

    :param doc: the document; must provide `pages(page_range)` and `_normalize()`.
    :param page_range: forwarded unchanged to `doc.pages()`.
    :param output_path: HTML output file, or the output folder when
        `format_chapters` is set. Its stem also names the debug PDF.
    :param format_chapters: write one `chapter_<filename>.html` file per child
        of the document tree whose name is `"chapter"` into `output_path`,
        instead of one single HTML file.
    :param pretty: forwarded to `write_html()`.
    :param render_html: format and write the HTML output.
    :param render_pdf: pass every visited page to `render_page_pdf()` and save
        the result as `debug_<output_path stem>.pdf` (relative path).
    :param render_all: also visit pages whose `is_relevant` attribute is false.
    :param show_ast: print the tree of every content area of every visited page.
    :param show_tree: print the tree of the merged and normalized document.
    :param show_tags: print `descr()` of every entry in `page.structures`.
    :return: always `True`.
    """
    document = None
    debug_doc = None
    debug_index = 0
    # The merged document tree is only needed for these two outputs.
    build_document = show_tree or render_html

    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if build_document or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if build_document:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            # The debug document is threaded through the calls so that every
            # page ends up in the same output document.
            debug_doc = render_page_pdf(doc, page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was visited (empty range or nothing relevant): there is
            # nothing to wrap in a PdfDocument, so do not create an empty file
            # and fail on the missing document.
            print("No pages rendered, skipping debug PDF!")
        else:
            # Path() accepts both str and Path, matching the HTML branch below
            # which only ever uses str(output_path).
            debug_file = f"debug_{Path(output_path).stem}.pdf"
            with open(debug_file, "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if not build_document:
        return True

    if document is None:
        print("No pages parsed, empty document!")
        return True

    document = doc._normalize(document)
    if show_tree:
        print(RenderTree(document))

    if render_html:
        if format_chapters:
            for chapter in document.children:
                if chapter.name == "chapter":
                    print(f"\nFormatting HTML for '{chapter.title}'")
                    html = format_document(chapter)
                    output_file = f"{output_path}/chapter_{chapter._filename}.html"
                    print(f"\nWriting HTML '{output_file}'")
                    write_html(html, output_file, pretty=pretty)
        else:
            print("\nFormatting HTML")
            html = format_document(document)
            print(f"\nWriting HTML '{str(output_path)}'")
            write_html(html, str(output_path), pretty=pretty)

    return True
def patch( doc, data_module, output_path: pathlib.Path, patch_file: pathlib.Path = None) -> bool:
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """
    Apply a patch file to `output_path`.

    An explicitly given `patch_file` is handed to `apply_patch()`. Otherwise
    the patch is looked up inside `data_module`, trying the version-specific
    name `<doc.name>.patch` first and then the name shared between versions,
    built from the part of `doc.name` before the first `-`.

    :param doc: the document; only its `name` attribute is read.
    :param data_module: package that is searched for the patch file.
    :param output_path: forwarded to the patch helper as the patch target.
    :param patch_file: explicit patch file; skips the package lookup.
    :return: `True` if no patch file exists in `data_module`, otherwise the
        result of the patch helper that was called.
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # Prefer the patch written for exactly this document version.
    versioned_name = f"{doc.name}.patch"
    if pkg_file_exists(data_module, versioned_name):
        return pkg_apply_patch(data_module, versioned_name, output_path)

    # Fall back to the patch shared by all versions of the document.
    shared_name = f"{doc.name.split('-')[0]}.patch"
    if pkg_file_exists(data_module, shared_name):
        return pkg_apply_patch(data_module, shared_name, output_path)

    # Nothing to apply counts as success.
    return True