modm_data.pdf2html

PDF to HTML Pipeline

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4"""
 5# PDF to HTML Pipeline
 6"""
 7
 8from . import stmicro
 9from .render import render_page_pdf
10from .convert import convert, patch
11from .html import format_document, write_html
12
13from . import ast
14from . import cell
15from . import figure
16from . import line
17from . import page
18from . import table
19
20__all__ = [
21    "stmicro",
22    "render_page_pdf",
23    "convert",
24    "patch",
25    "format_document",
26    "write_html",
27    "ast",
28    "cell",
29    "figure",
30    "line",
31    "page",
32    "table",
33]
def render_page_pdf(doc, page, new_doc=None, index=0):
    """
    Render a page into the debug PDF and draw the detected structure on top.

    The page is first handed to ``pdf_render_page_pdf``; the page at ``index``
    of the document it returns is then annotated with outlines for:

    - every entry of ``page._areas`` (stroke ``0xFFA500``),
    - the ``cbbox``/``bbox`` of every graphic (``0x9ACD32``/``0x00FF00``),
    - every table: bounding box and grid lines (``0x0000FF``), the character
      clusters inside its cells (``0x808080``) and the cell borders, whose
      stroke width is the border value itself (left ``0xFF0000``,
      right ``0x0000FF``, bottom ``0x00FF00``, top ``0x808080``).

    :param doc: PDF document
    :param page: PDF page
    :param new_doc: Empty PDF document to copy debug renders to
    :param index: Index of the page inside ``new_doc`` that is loaded and
        drawn on
    :return: The debug document returned by ``pdf_render_page_pdf``
    :raises RuntimeError: If the content of the annotated page cannot be
        generated
    """
    new_doc = pdf_render_page_pdf(doc, page, new_doc, index)
    new_page = pp.raw.FPDF_LoadPage(new_doc, index)
    try:
        rotation = page.rotation
        width, height = page.width, page.height

        # Flip to True to overlay a 20x20 reference grid while debugging.
        draw_grid = False
        if draw_grid:
            for ii in range(20):
                _vline(new_page, rotation, width * ii / 20, 0, height, width=1, stroke="black")
                _hline(new_page, rotation, height * ii / 20, 0, width, width=1, stroke="black")

        # Optional overlay of the page spacings, disabled by default:
        # for name, distance in page._spacing.items():
        #     if name.startswith("x_"):
        #         _vline(new_page, rotation, distance, 0, height, width=0.5, stroke=0xFFA500)
        #     else:
        #         _hline(new_page, rotation, distance, 0, width, width=0.5, stroke=0xFFA500)

        # An area is either a single rectangle or a list of rectangles.
        for area in page._areas.values():
            rects = area if isinstance(area, list) else [area]
            for rect in rects:
                _rect(new_page, rotation, rect, width=0.5, stroke=0xFFA500)

        for obj in page.content_graphics:
            if obj.cbbox is not None:
                _rect(new_page, rotation, obj.cbbox, width=2, stroke=0x9ACD32)
            if obj.bbox is not None:
                _rect(new_page, rotation, obj.bbox, width=2, stroke=0x00FF00)

        for table in page.content_tables:
            _rect(new_page, rotation, table.bbox, width=1.5, stroke=0x0000FF)

            # Grid lines of the x grid first, then those of the y grid.
            for grid in (table._xgrid, table._ygrid):
                for grid_lines in grid.values():
                    for grid_line in grid_lines:
                        _line(new_page, rotation, grid_line, width=0.75, stroke=0x0000FF)

            for cell in table.cells:
                for text_line in cell.lines:
                    for cluster in text_line.clusters():
                        _rect(new_page, rotation, cluster.bbox, width=0.33, stroke=0x808080)

                # Only draw the borders that are set; a falsy value means no border.
                bbox = cell.bbox
                if cell.b.l:
                    _vline(new_page, rotation, bbox.left, bbox.bottom, bbox.top, width=cell.b.l, stroke=0xFF0000)
                if cell.b.r:
                    _vline(new_page, rotation, bbox.right, bbox.bottom, bbox.top, width=cell.b.r, stroke=0x0000FF)
                if cell.b.b:
                    _hline(new_page, rotation, bbox.bottom, bbox.left, bbox.right, width=cell.b.b, stroke=0x00FF00)
                if cell.b.t:
                    _hline(new_page, rotation, bbox.top, bbox.left, bbox.right, width=cell.b.t, stroke=0x808080)

        # This call must always run: wrapped in `assert` it would be stripped
        # under `python -O` and the drawn objects would never be committed.
        if not pp.raw.FPDFPage_GenerateContent(new_page):
            raise RuntimeError(f"Failed to generate content for debug page {index}!")
    finally:
        # Release the page handle even if drawing or generation failed.
        pp.raw.FPDF_ClosePage(new_page)
    return new_doc
Parameters
  • doc: PDF document
  • page: PDF page
  • new_doc: Empty PDF document to copy debug renders to
  • index: Index of the page inside new_doc that is loaded and drawn on
def convert(
    doc,
    page_range,
    output_path,
    format_chapters=False,
    pretty=True,
    render_html=True,
    render_pdf=False,
    render_all=False,
    show_ast=False,
    show_tree=False,
    show_tags=False,
) -> bool:
    """
    Convert the pages of a PDF document into HTML and/or debug output.

    Every page yielded by ``doc.pages(page_range)`` is processed unless it is
    not relevant and ``render_all`` is unset. The areas of all processed pages
    are merged into one document, which is normalized via ``doc._normalize``
    and then printed and/or written as HTML.

    :param doc: PDF document providing ``pages()`` and ``_normalize()``
    :param page_range: Pages to convert, passed on to ``doc.pages()``
    :param output_path: Path of the HTML file to write. With
        ``format_chapters`` it is used as the directory for the
        ``chapter_*.html`` files. Its ``stem`` names the debug PDF.
    :param format_chapters: Write one HTML file per ``chapter`` child of the
        document instead of a single file
    :param pretty: Passed on to ``write_html`` as ``pretty``
    :param render_html: Format the merged document and write it as HTML
    :param render_pdf: Render every processed page into
        ``debug_<output_path.stem>.pdf`` in the current working directory
    :param render_all: Also process pages that are not marked as relevant
    :param show_ast: Print the tree of every area of every processed page
    :param show_tree: Print the tree of the normalized document
    :param show_tags: Print the description of every structure of every page
    :return: Always ``True``
    """
    needs_document = show_tree or render_html
    document = None
    debug_doc = None
    debug_index = 0
    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if needs_document or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if needs_document:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = render_page_pdf(doc, page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was processed, so there is no debug document to save.
            print("No pages rendered, skipping debug PDF!")
        else:
            with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if not needs_document:
        return True

    if document is None:
        print("No pages parsed, empty document!")
        return True

    document = doc._normalize(document)
    if show_tree:
        print(RenderTree(document))

    if render_html:
        if format_chapters:
            for chapter in document.children:
                if chapter.name == "chapter":
                    print(f"\nFormatting HTML for '{chapter.title}'")
                    html = format_document(chapter)
                    output_file = f"{output_path}/chapter_{chapter._filename}.html"
                    print(f"\nWriting HTML '{output_file}'")
                    write_html(html, output_file, pretty=pretty)
        else:
            print("\nFormatting HTML")
            html = format_document(document)
            print(f"\nWriting HTML '{str(output_path)}'")
            write_html(html, str(output_path), pretty=pretty)

    return True
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """
    Apply a patch to the converted output.

    An explicitly given ``patch_file`` is applied directly. Otherwise a patch
    is looked up inside ``data_module``: first ``<doc.name>.patch`` for the
    specific document version, then ``<doc.name up to the first '-'>.patch``
    shared between versions.

    :param doc: Document whose ``name`` selects the packaged patch file
    :param data_module: Package that is searched for the patch file
    :param output_path: Passed on to the patch helper as the target
    :param patch_file: Explicit patch file, skips the package lookup
    :return: ``True`` if no packaged patch exists, otherwise the result of
        the patch helper
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # The patch for this exact document version takes precedence.
    version_patch = f"{doc.name}.patch"
    if pkg_file_exists(data_module, version_patch):
        return pkg_apply_patch(data_module, version_patch, output_path)

    # Fall back to the patch shared between all versions of the document.
    shared_patch = f"{doc.name.split('-')[0]}.patch"
    if pkg_file_exists(data_module, shared_patch):
        return pkg_apply_patch(data_module, shared_patch, output_path)

    # Nothing to patch.
    return True
def format_document(document):
    """
    Build the HTML element tree for a document.

    The result consists of a ``<head>`` linking the ``../style.css``
    stylesheet and a ``<body>`` that ``_format_html`` fills from ``document``.

    :param document: Document node to format
    :return: ``etree.ElementTree`` wrapping the ``<html>`` root element
    """
    root = etree.Element("html")

    head = etree.SubElement(root, "head")
    stylesheet = etree.SubElement(head, "link")
    stylesheet.set("rel", "stylesheet")
    stylesheet.set("href", "../style.css")

    body = etree.SubElement(root, "body")
    _format_html(body, document, with_newlines=True)

    return etree.ElementTree(root)
def write_html(html, path, pretty=True):
    """
    Write an HTML element tree to a file, prefixed with an HTML5 doctype.

    :param html: Element tree to write, e.g. the result of ``format_document``
    :param path: Path of the file to write; it is opened in binary mode
    :param pretty: Passed to the tree's ``write()`` as ``pretty_print``
    """
    with open(path, "wb") as html_file:
        html.write(html_file, pretty_print=pretty, doctype="<!DOCTYPE html>")