modm_data.pdf2html
PDF to HTML Pipeline
1# Copyright 2022, Niklas Hauser 2# SPDX-License-Identifier: MPL-2.0 3 4""" 5# PDF to HTML Pipeline 6""" 7 8from . import stmicro 9from .render import render_page_pdf 10from .convert import convert, patch 11from .html import format_document, write_html 12 13from . import ast 14from . import cell 15from . import figure 16from . import line 17from . import page 18from . import table 19 20__all__ = [ 21 "stmicro", 22 "render_page_pdf", 23 "convert", 24 "patch", 25 "format_document", 26 "write_html", 27 "ast", 28 "cell", 29 "figure", 30 "line", 31 "page", 32 "table", 33]
def
render_page_pdf(doc, page, new_doc=None, index=0):
def render_page_pdf(doc, page, new_doc=None, index=0):
    """
    Render a page into the debug PDF and draw the detected layout on top.

    The page is first passed to `pdf_render_page_pdf`; the page at `index` of
    the resulting document is then annotated with outlines of the page areas,
    graphics, tables, table grid lines, character clusters and cell borders.

    :param doc: PDF document
    :param page: PDF page
    :param new_doc: Empty PDF document to copy debug renders to
    :param index: Index of the page inside the debug document to draw on
    :return: The debug document as returned by `pdf_render_page_pdf`.
    :raises AssertionError: If the content of the debug page cannot be generated.
    """
    new_doc = pdf_render_page_pdf(doc, page, new_doc, index)
    new_page = pp.raw.FPDF_LoadPage(new_doc, index)
    rotation = page.rotation
    width, height = page.width, page.height

    # Disabled debug aid: flip to True to overlay a 20x20 reference grid.
    if False:
        for ii in range(20):
            _vline(new_page, rotation, width * ii / 20, 0, height, width=1, stroke="black")
            _hline(new_page, rotation, height * ii / 20, 0, width, width=1, stroke="black")

    # Disabled debug aid: draw the page spacing guides.
    # for name, distance in page._spacing.items():
    #     if name.startswith("x_"):
    #         _vline(new_page, rotation, distance, 0, height, width=0.5, stroke=0xFFA500)
    #     else:
    #         _hline(new_page, rotation, distance, 0, width, width=0.5, stroke=0xFFA500)

    # Page areas: an entry is either a single rectangle or a list of them.
    for area in page._areas.values():
        if isinstance(area, list):
            for rect in area:
                _rect(new_page, rotation, rect, width=0.5, stroke=0xFFA500)
        else:
            _rect(new_page, rotation, area, width=0.5, stroke=0xFFA500)

    # Graphics: `cbbox` and `bbox` are both optional and drawn in different colors.
    for obj in page.content_graphics:
        if obj.cbbox is not None:
            _rect(new_page, rotation, obj.cbbox, width=2, stroke=0x9ACD32)
        if obj.bbox is not None:
            _rect(new_page, rotation, obj.bbox, width=2, stroke=0x00FF00)

    for table in page.content_tables:
        _rect(new_page, rotation, table.bbox, width=1.5, stroke=0x0000FF)

        # Grid lines of the table along both axes.
        for lines in table._xgrid.values():
            for line in lines:
                _line(new_page, rotation, line, width=0.75, stroke=0x0000FF)
        for lines in table._ygrid.values():
            for line in lines:
                _line(new_page, rotation, line, width=0.75, stroke=0x0000FF)

        for cell in table.cells:
            for line in cell.lines:
                for cluster in line.clusters():
                    _rect(new_page, rotation, cluster.bbox, width=0.33, stroke=0x808080)

            # Cell borders: the border value is used as the stroke width and
            # every side has its own color (left red, right blue, bottom
            # green, top gray). Sides with a falsy border value are skipped.
            box = cell.bbox
            if cell.b.l:
                _vline(new_page, rotation, box.left, box.bottom, box.top, width=cell.b.l, stroke=0xFF0000)
            if cell.b.r:
                _vline(new_page, rotation, box.right, box.bottom, box.top, width=cell.b.r, stroke=0x0000FF)
            if cell.b.b:
                _hline(new_page, rotation, box.bottom, box.left, box.right, width=cell.b.b, stroke=0x00FF00)
            if cell.b.t:
                _hline(new_page, rotation, box.top, box.left, box.right, width=cell.b.t, stroke=0x808080)

    # The call must not live inside an `assert`: under `python -O` the whole
    # statement is stripped and the overlay would never be generated.
    generated = pp.raw.FPDFPage_GenerateContent(new_page)
    pp.raw.FPDF_ClosePage(new_page)
    if not generated:
        # AssertionError is kept so callers see the same exception type as before.
        raise AssertionError("Failed to generate the content of the debug page")
    return new_doc
Parameters
- doc: PDF document
- page: PDF page
- new_doc: Empty PDF document to copy debug renders to
- index: Index of the page inside the debug document that the overlay is drawn on (default 0)
def
convert( doc, page_range, output_path, format_chapters=False, pretty=True, render_html=True, render_pdf=False, render_all=False, show_ast=False, show_tree=False, show_tags=False) -> bool:
def convert(
    doc,
    page_range,
    output_path,
    format_chapters=False,
    pretty=True,
    render_html=True,
    render_pdf=False,
    render_all=False,
    show_ast=False,
    show_tree=False,
    show_tags=False,
) -> bool:
    """
    Convert the selected pages of a PDF document into HTML.

    :param doc: PDF document providing `pages()` and `_normalize()`.
    :param page_range: Selection of pages, forwarded to `doc.pages()`.
    :param output_path: Path-like object of the HTML output. It is the target
        file, or the target directory when `format_chapters` is set. Its
        `stem` names the debug PDF.
    :param format_chapters: Write one `chapter_*.html` file per chapter into
        `output_path` instead of a single HTML file.
    :param pretty: Forwarded to `write_html`.
    :param render_html: Format and write the HTML output.
    :param render_pdf: Write the debug renders to `debug_<stem>.pdf`.
    :param render_all: Also process pages that are not marked as relevant.
    :param show_ast: Print the tree of every content area of each page.
    :param show_tree: Print the tree of the normalized, merged document.
    :param show_tags: Print the description of every page structure.
    :return: Always `True`.
    """
    build_document = show_tree or render_html
    document = None
    debug_doc = None
    debug_index = 0
    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if build_document or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if build_document:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = render_page_pdf(doc, page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was rendered, so there is no document to save. Skip the
            # file instead of creating an empty one and failing on `None`.
            print("No pages rendered, no debug PDF written!")
        else:
            with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if not build_document:
        return True

    if document is None:
        print("No pages parsed, empty document!")
        return True

    document = doc._normalize(document)
    if show_tree:
        print(RenderTree(document))

    if render_html:
        if format_chapters:
            # Only children named "chapter" get their own output file.
            for chapter in document.children:
                if chapter.name == "chapter":
                    print(f"\nFormatting HTML for '{chapter.title}'")
                    html = format_document(chapter)
                    output_file = f"{output_path}/chapter_{chapter._filename}.html"
                    print(f"\nWriting HTML '{output_file}'")
                    write_html(html, output_file, pretty=pretty)
        else:
            print("\nFormatting HTML")
            html = format_document(document)
            print(f"\nWriting HTML '{str(output_path)}'")
            write_html(html, str(output_path), pretty=pretty)

    return True
def
patch( doc, data_module, output_path: pathlib.Path, patch_file: pathlib.Path = None) -> bool:
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """
    Apply a patch to the converted output.

    An explicitly given `patch_file` is applied directly. Otherwise a patch
    shipped inside `data_module` is looked up by document name: first the one
    for the exact version, then the one shared between versions.

    :param doc: Document whose `name` selects the packaged patch file.
    :param data_module: Module that is searched for packaged patch files.
    :param output_path: Location passed to the patch helpers as the target.
    :param patch_file: Optional patch file that bypasses the packaged lookup.
    :return: Result of the patch helper that was used, or `True` when no
        packaged patch exists for this document.
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # Most specific first: the patch for exactly this document version.
    versioned = f"{doc.name}.patch"
    if pkg_file_exists(data_module, versioned):
        return pkg_apply_patch(data_module, versioned, output_path)

    # Fall back to the patch shared by all versions (name before the first "-").
    shared = f"{doc.name.split('-')[0]}.patch"
    if pkg_file_exists(data_module, shared):
        return pkg_apply_patch(data_module, shared, output_path)

    # Nothing to apply counts as success.
    return True
def
format_document(document):
def format_document(document):
    """
    Wrap the formatted document in a complete HTML tree.

    The tree consists of a `head` that links the stylesheet `../style.css`
    and a `body` that is filled by `_format_html` with newlines enabled.

    :param document: Document node to format into the body.
    :return: An `etree.ElementTree` rooted at the `html` element.
    """
    root = etree.Element("html")

    # <head> with the stylesheet link
    head = etree.SubElement(root, "head")
    stylesheet = etree.SubElement(head, "link")
    stylesheet.set("rel", "stylesheet")
    stylesheet.set("href", "../style.css")

    # <body> with the formatted document content
    body = etree.SubElement(root, "body")
    _format_html(body, document, with_newlines=True)

    return etree.ElementTree(root)
def
write_html(html, path, pretty=True):