modm_data.pdf2html
PDF to HTML Pipeline
1# Copyright 2022, Niklas Hauser 2# SPDX-License-Identifier: MPL-2.0 3 4""" 5# PDF to HTML Pipeline 6""" 7 8from .render import annotate_debug_info 9from .convert import convert, patch 10from .html import format_document, write_html 11 12__all__ = [ 13 "stmicro", 14 "ti", 15 "convert", 16 "annotate_debug_info", 17 "format_document", 18 "write_html", 19 "patch", 20 "ast", 21 "cell", 22 "figure", 23 "line", 24 "page", 25 "table", 26]
ti
def
convert( doc: pypdfium2._helpers.document.PdfDocument, page_range: Iterable[int], output_path: pathlib.Path, format_chapters: bool = False, pretty: bool = True, render_html: bool = True, render_pdf: bool = False, render_all: bool = False, show_ast: bool = False, show_tree: bool = False, show_tags: bool = False) -> bool:
def convert(
    doc: pp.PdfDocument,
    page_range: Iterable[int],
    output_path: Path,
    format_chapters: bool = False,
    pretty: bool = True,
    render_html: bool = True,
    render_pdf: bool = False,
    render_all: bool = False,
    show_ast: bool = False,
    show_tree: bool = False,
    show_tags: bool = False,
) -> bool:
    """
    Converts the selected pages of a PDF document into HTML and/or debug output.

    Every page yielded by `doc.pages(page_range)` is visited; pages whose `is_relevant` attribute is false are skipped
    unless `render_all` is set. The content areas of the visited pages are merged into one document tree, which is
    normalized via `doc._normalize` and then formatted as HTML.

    :param doc: The PDF document to convert.
    :param page_range: The page indexes passed on to `doc.pages`.
    :param output_path: The HTML output file, or the output folder if `format_chapters` is set.
                        Its stem is also used to name the debug PDF `debug_{stem}.pdf`.
    :param format_chapters: Write one `chapter_{filename}.html` file per top-level node named `chapter` into
                            `output_path` instead of writing a single HTML file.
    :param pretty: Passed on to `write_html`.
    :param render_html: Build the document tree and write the HTML output.
    :param render_pdf: Annotate every visited page with debug information and save the result as a debug PDF
                       in the current working directory.
    :param render_all: Also visit pages that are not marked as relevant.
    :param show_ast: Print the content AST of every visited page.
    :param show_tree: Print the merged and normalized document tree.
    :param show_tags: Print the structure descriptions of every visited page.
    :return: Always `True`, also when no page was parsed.
    """
    document = None
    debug_doc = None
    debug_index = 0
    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if show_tree or render_html or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if show_tree or render_html:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = annotate_debug_info(page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was visited, so there is nothing to wrap and save: do not
            # create an empty output file or hand `None` to `pp.PdfDocument`.
            print("No pages annotated, skipping debug PDF!")
        else:
            with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if show_tree or render_html:
        if document is None:
            print("No pages parsed, empty document!")
            return True

        document = doc._normalize(document)
        if show_tree:
            print(RenderTree(document))

        if render_html:
            if format_chapters:
                for chapter in document.children:
                    if chapter.name == "chapter":
                        print(f"\nFormatting HTML for '{chapter.title}'")
                        html = format_document(chapter)
                        output_file = f"{output_path}/chapter_{chapter._filename}.html"
                        print(f"\nWriting HTML '{output_file}'")
                        write_html(html, output_file, pretty=pretty)
            else:
                print("\nFormatting HTML")
                html = format_document(document)
                print(f"\nWriting HTML '{str(output_path)}'")
                write_html(html, str(output_path), pretty=pretty)

    return True
def
annotate_debug_info( page: modm_data.pdf2html.page.Page, new_doc: pypdfium2._helpers.document.PdfDocument = None, index: int = 0) -> pypdfium2._helpers.document.PdfDocument:
def annotate_debug_info(page: Page, new_doc: pp.PdfDocument = None, index: int = 0) -> pp.PdfDocument:
    """
    Copies the page into a new or existing PDF document and overlays the internal information on top of the content.
    In addition to the information overlaid in `modm_data.pdf.annotate_debug_info`, this function:
    - renders all content areas in ORANGE.
    - renders all graphic clusters in content areas in GREEN.
    - renders all tables in content areas in BLUE.

    :param page: The page to be annotated.
    :param new_doc: The PDF document to copy the page to. If not provided, a new document is created.
    :param index: The index of the page in the new document.
    :return: The new document with the annotated page added.
    :raises RuntimeError: If the content of the annotated page could not be generated.
    """
    new_doc = pdf_annotate_debug_info(page, new_doc, index)
    new_page = pp.raw.FPDF_LoadPage(new_doc, index)
    rotation = page.rotation

    try:
        # Content areas: an entry is either a single rectangle or a list of them.
        for area in page._areas.values():
            if isinstance(area, list):
                for rect in area:
                    _rect(new_page, rotation, rect, width=0.5, stroke=0xFFA500)
            else:
                _rect(new_page, rotation, area, width=0.5, stroke=0xFFA500)

        # Graphic clusters: both bounding boxes are optional.
        for obj in page.content_graphics:
            if obj.cbbox is not None:
                _rect(new_page, rotation, obj.cbbox, width=2, stroke=0x9ACD32)
            if obj.bbox is not None:
                _rect(new_page, rotation, obj.bbox, width=2, stroke=0x00FF00)

        for table in page.content_tables:
            _rect(new_page, rotation, table.bbox, width=1.5, stroke=0x0000FF)

            # Grid lines of the table, first the x grid, then the y grid.
            for grid in (table._xgrid, table._ygrid):
                for lines in grid.values():
                    for line in lines:
                        _line(new_page, rotation, line, width=0.75, stroke=0x0000FF)

            for cell in table.cells:
                for line in cell.lines:
                    for cluster in line.clusters():
                        _rect(new_page, rotation, cluster.bbox, width=0.33, stroke=0x808080)

                # Each cell border is drawn with its own width and a distinct
                # color per side, so the sides can be told apart in the overlay.
                borders = cell.borders
                bbox = cell.bbox
                if borders.left:
                    _vline(new_page, rotation, bbox.left, bbox.bottom, bbox.top, width=borders.left, stroke=0xFF0000)
                if borders.right:
                    _vline(new_page, rotation, bbox.right, bbox.bottom, bbox.top, width=borders.right, stroke=0x0000FF)
                if borders.bottom:
                    _hline(new_page, rotation, bbox.bottom, bbox.left, bbox.right, width=borders.bottom, stroke=0x00FF00)
                if borders.top:
                    _hline(new_page, rotation, bbox.top, bbox.left, bbox.right, width=borders.top, stroke=0x808080)

        # This call has a side effect and must not live inside an `assert`:
        # `python -O` strips asserts and would silently drop all overlays.
        if not pp.raw.FPDFPage_GenerateContent(new_page):
            raise RuntimeError(f"Failed to generate the content of debug page {index}!")
    finally:
        # Release the page handle even if drawing or content generation failed.
        pp.raw.FPDF_ClosePage(new_page)
    return new_doc
Copies the page into a new or existing PDF document and overlays the internal information on top of the content.
In addition to the information overlaid in modm_data.pdf.annotate_debug_info
, this function:
- renders all content areas in ORANGE.
- renders all graphic clusters in content areas in GREEN.
- renders all tables in content areas in BLUE.
Parameters
- page: The page to be annotated.
- new_doc: The PDF document to copy the page to. If not provided, a new document is created.
- index: The index of the page in the new document.
Returns
The new document with the annotated page added.
def
format_document(document):
def format_document(document):
    """
    Builds a complete HTML element tree for the given document node.

    The tree consists of a `<head>` that links the `../style.css` stylesheet and a `<body>` that is filled by
    `_format_html` with newlines enabled.

    :param document: The document node passed on to `_format_html`.
    :return: An `etree.ElementTree` wrapping the `<html>` root element.
    """
    root = etree.Element("html")

    # <head> only carries the stylesheet reference.
    head = etree.SubElement(root, "head")
    stylesheet = etree.SubElement(head, "link")
    stylesheet.set("rel", "stylesheet")
    stylesheet.set("href", "../style.css")

    # <body> is attached to the root before it gets populated.
    body = etree.SubElement(root, "body")
    _format_html(body, document, with_newlines=True)

    return etree.ElementTree(root)
def
write_html(html, path, pretty=True):
def
patch( doc, data_module, output_path: pathlib.Path, patch_file: pathlib.Path = None) -> bool:
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """
    Applies a patch to the generated output.

    If `patch_file` is given, it is applied directly via `apply_patch`. Otherwise a patch file is looked up inside
    `data_module`: first `{doc.name}.patch` for the specific version, then the one named after the part of `doc.name`
    before the first `-`, which is shared between versions.

    :param doc: The document whose `name` selects the packaged patch file.
    :param data_module: The module passed on to `pkg_file_exists` and `pkg_apply_patch`.
    :param output_path: The path passed on to the patch function.
    :param patch_file: An explicit patch file that bypasses the lookup in `data_module`.
    :return: The result of the patch function, or `True` if no packaged patch file exists.
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # Most specific candidate first: version-specific, then shared between versions.
    candidates = (f"{doc.name}.patch", f"{doc.name.split('-')[0]}.patch")
    for candidate in candidates:
        if pkg_file_exists(data_module, candidate):
            return pkg_apply_patch(data_module, candidate, output_path)

    # Nothing to apply counts as success.
    return True