modm_data.pdf2html

PDF to HTML Pipeline

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4"""
 5# PDF to HTML Pipeline
 6"""
 7
 8from .render import annotate_debug_info
 9from .convert import convert, patch
10from .html import format_document, write_html
11
12__all__ = [
13    "stmicro",
14    "ti",
15    "convert",
16    "annotate_debug_info",
17    "format_document",
18    "write_html",
19    "patch",
20    "ast",
21    "cell",
22    "figure",
23    "line",
24    "page",
25    "table",
26]
ti
def convert( doc: pypdfium2._helpers.document.PdfDocument, page_range: Iterable[int], output_path: pathlib.Path, format_chapters: bool = False, pretty: bool = True, render_html: bool = True, render_pdf: bool = False, render_all: bool = False, show_ast: bool = False, show_tree: bool = False, show_tags: bool = False) -> bool:
def convert(
    doc: pp.PdfDocument,
    page_range: Iterable[int],
    output_path: Path,
    format_chapters: bool = False,
    pretty: bool = True,
    render_html: bool = True,
    render_pdf: bool = False,
    render_all: bool = False,
    show_ast: bool = False,
    show_tree: bool = False,
    show_tags: bool = False,
) -> bool:
    """
    Converts the selected pages of a PDF document into HTML and/or debug output.

    Every processed page is announced on stdout. The content areas of all
    processed pages are merged into one document tree, which is normalized via
    `doc._normalize()` before it is printed and/or written as HTML.

    :param doc: The document to convert. It must provide `pages()` and `_normalize()`.
    :param page_range: The page selection that is passed to `doc.pages()`.
    :param output_path: The HTML file to write, or the folder that receives the
        `chapter_*.html` files if `format_chapters` is set. Its stem also names the debug PDF.
    :param format_chapters: Write one HTML file per top-level `chapter` node instead of a single file.
    :param pretty: Passed on to `write_html` to pretty-print the HTML.
    :param render_html: Format the merged document and write it as HTML.
    :param render_pdf: Annotate every processed page and save the result as
        `debug_<output_path.stem>.pdf` relative to the current working directory.
    :param render_all: Also process pages whose `is_relevant` attribute is false.
    :param show_ast: Print the tree of every content area of every processed page.
    :param show_tree: Print the tree of the merged and normalized document.
    :param show_tags: Print the description of every structure of every processed page.
    :return: Always `True`, even if no page was processed.
    """
    # The merged document is only required for these two outputs.
    needs_document = show_tree or render_html

    document = None
    debug_doc = None
    debug_index = 0
    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if needs_document or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if needs_document:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = annotate_debug_info(page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was annotated: do not create an empty file and do not
            # hand `None` to `pp.PdfDocument`.
            print("No pages rendered, skipping debug PDF!")
        else:
            with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if needs_document:
        if document is None:
            print("No pages parsed, empty document!")
            return True

        document = doc._normalize(document)
        if show_tree:
            print(RenderTree(document))

        if render_html:
            if format_chapters:
                for chapter in document.children:
                    # Other top-level nodes are not written in chapter mode.
                    if chapter.name == "chapter":
                        print(f"\nFormatting HTML for '{chapter.title}'")
                        html = format_document(chapter)
                        output_file = f"{output_path}/chapter_{chapter._filename}.html"
                        print(f"\nWriting HTML '{output_file}'")
                        write_html(html, output_file, pretty=pretty)
            else:
                print("\nFormatting HTML")
                html = format_document(document)
                print(f"\nWriting HTML '{str(output_path)}'")
                write_html(html, str(output_path), pretty=pretty)

    return True
def annotate_debug_info( page: modm_data.pdf2html.page.Page, new_doc: pypdfium2._helpers.document.PdfDocument = None, index: int = 0) -> pypdfium2._helpers.document.PdfDocument:
 11def annotate_debug_info(page: Page, new_doc: pp.PdfDocument = None, index: int = 0) -> pp.PdfDocument:
 12    """
 13    Copies each page into a new or existing PDF document and overlays the internal information on top of the content.
 14    In addition to the information overlayed in `modm_data.pdf.annotate_debug_info`, this function:
 15    - renders all content areas in ORANGE.
 16    - renders all graphic cluster in content areas in GREEN.
 17    - renders all tables in content areas in BLUE.
 18
 19    :param page: The page to be annotated.
 20    :param new_doc: The PDF document to copy the page to. If not provided, a new document is created.
 21    :param index: The index of the page in the new document.
 22    :return: The new document with the annotated page added.
 23    """
 24    new_doc = pdf_annotate_debug_info(page, new_doc, index)
 25    # return new_doc
 26    new_page = pp.raw.FPDF_LoadPage(new_doc, index)
 27    rotation = page.rotation
 28    width, height = page.width, page.height
 29
 30    if False:
 31        for ii in range(20):
 32            _vline(new_page, rotation, width * ii / 20, 0, height, width=1, stroke="black")
 33            _hline(new_page, rotation, height * ii / 20, 0, width, width=1, stroke="black")
 34
 35    # for name, distance in page._spacing.items():
 36    #     if name.startswith("x_"):
 37    #         _vline(new_page, rotation, distance, 0, height, width=0.5, stroke=0xFFA500)
 38    #     else:
 39    #         _hline(new_page, rotation, distance, 0, width, width=0.5, stroke=0xFFA500)
 40
 41    for name, area in page._areas.items():
 42        if isinstance(area, list):
 43            for rect in area:
 44                _rect(new_page, rotation, rect, width=0.5, stroke=0xFFA500)
 45        else:
 46            _rect(new_page, rotation, area, width=0.5, stroke=0xFFA500)
 47
 48    for obj in page.content_graphics:
 49        if obj.cbbox is not None:
 50            _rect(new_page, rotation, obj.cbbox, width=2, stroke=0x9ACD32)
 51        if obj.bbox is not None:
 52            _rect(new_page, rotation, obj.bbox, width=2, stroke=0x00FF00)
 53
 54    for table in page.content_tables:
 55        _rect(new_page, rotation, table.bbox, width=1.5, stroke=0x0000FF)
 56
 57        for lines in table._xgrid.values():
 58            for line in lines:
 59                _line(new_page, rotation, line, width=0.75, stroke=0x0000FF)
 60        for lines in table._ygrid.values():
 61            for line in lines:
 62                _line(new_page, rotation, line, width=0.75, stroke=0x0000FF)
 63
 64        for cell in table.cells:
 65            for line in cell.lines:
 66                for cluster in line.clusters():
 67                    _rect(new_page, rotation, cluster.bbox, width=0.33, stroke=0x808080)
 68            if cell.borders.left:
 69                _vline(
 70                    new_page,
 71                    rotation,
 72                    cell.bbox.left,
 73                    cell.bbox.bottom,
 74                    cell.bbox.top,
 75                    width=cell.borders.left,
 76                    stroke=0xFF0000,
 77                )
 78            if cell.borders.right:
 79                _vline(
 80                    new_page,
 81                    rotation,
 82                    cell.bbox.right,
 83                    cell.bbox.bottom,
 84                    cell.bbox.top,
 85                    width=cell.borders.right,
 86                    stroke=0x0000FF,
 87                )
 88            if cell.borders.bottom:
 89                _hline(
 90                    new_page,
 91                    rotation,
 92                    cell.bbox.bottom,
 93                    cell.bbox.left,
 94                    cell.bbox.right,
 95                    width=cell.borders.bottom,
 96                    stroke=0x00FF00,
 97                )
 98            if cell.borders.top:
 99                _hline(
100                    new_page,
101                    rotation,
102                    cell.bbox.top,
103                    cell.bbox.left,
104                    cell.bbox.right,
105                    width=cell.borders.top,
106                    stroke=0x808080,
107                )
108
109    assert pp.raw.FPDFPage_GenerateContent(new_page)
110    pp.raw.FPDF_ClosePage(new_page)
111    return new_doc

Copies each page into a new or existing PDF document and overlays the internal information on top of the content. In addition to the information overlaid in modm_data.pdf.annotate_debug_info, this function:

  • renders all content areas in ORANGE.
  • renders all graphic clusters in content areas in GREEN.
  • renders all tables in content areas in BLUE.
Parameters
  • page: The page to be annotated.
  • new_doc: The PDF document to copy the page to. If not provided, a new document is created.
  • index: The index of the page in the new document.
Returns

The new document with the annotated page added.

def format_document(document):
def format_document(document):
    """
    Builds the HTML element tree for a document node.

    The tree consists of a `<head>` that links the stylesheet `../style.css`
    and a `<body>` that is filled by `_format_html`.

    :param document: The node that is passed to `_format_html` to fill the body.
    :return: The element tree wrapping the root `<html>` element.
    """
    root = etree.Element("html")

    # <head> with the link to the shared stylesheet
    head = etree.Element("head")
    root.append(head)
    stylesheet = etree.Element("link")
    for attribute, value in (("rel", "stylesheet"), ("href", "../style.css")):
        stylesheet.set(attribute, value)
    head.append(stylesheet)

    # <body> with the formatted content
    body = etree.Element("body")
    root.append(body)
    _format_html(body, document, with_newlines=True)

    return etree.ElementTree(root)
def write_html(html, path, pretty=True):
def write_html(html, path, pretty=True):
    """
    Writes an HTML element tree to a file, preceded by the HTML doctype.

    :param html: The element tree to write. Its `write()` method is called with
        the `pretty_print` and `doctype` arguments.
    :param path: The file to write. It is opened in binary mode and overwritten.
    :param pretty: Passed on as `pretty_print`.
    """
    with open(path, mode="wb") as output:
        html.write(output, doctype="<!DOCTYPE html>", pretty_print=pretty)
def patch( doc, data_module, output_path: pathlib.Path, patch_file: pathlib.Path = None) -> bool:
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """
    Applies a patch to the converted output.

    An explicitly given patch file is applied with `apply_patch`. Otherwise a
    patch file is looked up inside `data_module`: first `<doc.name>.patch`, then
    `<first part of doc.name before a dash>.patch`.

    :param doc: The document whose `name` selects the packaged patch file.
    :param data_module: The module that is searched for packaged patch files.
    :param output_path: The location that the patch is applied to.
    :param patch_file: An explicit patch file that bypasses the lookup.
    :return: The result of applying the patch, or `True` if no packaged patch file exists.
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # The patch file for this specific version takes precedence ...
    versioned = f"{doc.name}.patch"
    if pkg_file_exists(data_module, versioned):
        return pkg_apply_patch(data_module, versioned, output_path)

    # ... over the patch file that is shared between versions.
    shared = f"{doc.name.split('-')[0]}.patch"
    if pkg_file_exists(data_module, shared):
        return pkg_apply_patch(data_module, shared, output_path)

    # Nothing to apply counts as success.
    return True