modm_data.pdf2html.convert
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

from anytree import RenderTree
from typing import Iterable

from .html import format_document, write_html
from .render import annotate_debug_info
from ..utils import pkg_apply_patch, pkg_file_exists, apply_patch
from .ast import merge_area
from pathlib import Path
import pypdfium2 as pp


def convert(
    doc: pp.PdfDocument,
    page_range: Iterable[int],
    output_path: Path,
    format_chapters: bool = False,
    pretty: bool = True,
    render_html: bool = True,
    render_pdf: bool = False,
    render_all: bool = False,
    show_ast: bool = False,
    show_tree: bool = False,
    show_tags: bool = False,
) -> bool:
    """Convert the selected pages of `doc` into HTML and/or debug output.

    Every page yielded by `doc.pages(page_range)` is visited; pages whose
    `is_relevant` attribute is false are skipped unless `render_all` is set.
    The areas of each visited page's `content_ast` are merged into a single
    document tree, which is normalized via `doc._normalize()` before it is
    printed and/or written as HTML.

    :param doc: the document providing `pages()` and `_normalize()`.
    :param page_range: the page selection handed to `doc.pages()`.
    :param output_path: the HTML output file, or the folder receiving the
        `chapter_*.html` files when `format_chapters` is set. Its stem also
        names the debug PDF.
    :param format_chapters: write one HTML file per top-level child whose
        name is `"chapter"` instead of a single file.
    :param pretty: forwarded to `write_html()`.
    :param render_html: format the merged document and write it as HTML.
    :param render_pdf: collect the result of `annotate_debug_info()` for each
        visited page and save it as `debug_<output_path.stem>.pdf` in the
        current working directory.
    :param render_all: also visit pages that are not marked as relevant.
    :param show_ast: print the tree of every area of every visited page.
    :param show_tree: print the tree of the merged and normalized document.
    :param show_tags: print `descr()` of every structure of every visited page.
    :return: always `True`.
    """
    document = None
    debug_doc = None
    debug_index = 0
    build_document = show_tree or render_html

    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if build_document or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if build_document:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = annotate_debug_info(page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was visited, so there is nothing to wrap in a
            # PdfDocument. Skip the save instead of creating an empty file
            # and then failing on `pp.PdfDocument(None)`.
            print("No pages rendered, skipping debug PDF!")
        else:
            with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if not build_document:
        return True
    if document is None:
        print("No pages parsed, empty document!")
        return True

    document = doc._normalize(document)
    if show_tree:
        print(RenderTree(document))
    if not render_html:
        return True

    if format_chapters:
        for chapter in document.children:
            if chapter.name != "chapter":
                continue
            print(f"\nFormatting HTML for '{chapter.title}'")
            html = format_document(chapter)
            output_file = f"{output_path}/chapter_{chapter._filename}.html"
            print(f"\nWriting HTML '{output_file}'")
            write_html(html, output_file, pretty=pretty)
    else:
        print("\nFormatting HTML")
        html = format_document(document)
        print(f"\nWriting HTML '{str(output_path)}'")
        write_html(html, str(output_path), pretty=pretty)

    return True


def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """Apply a patch to `output_path`.

    An explicitly given `patch_file` is handed to `apply_patch()`. Without
    one, a patch file is looked up inside `data_module`: first
    `<doc.name>.patch`, then `<prefix>.patch` where `<prefix>` is the part of
    `doc.name` before its first `-`.

    :param doc: object whose `name` attribute selects the packaged patch file.
    :param data_module: the package searched for the patch file.
    :param output_path: the location the patch is applied to.
    :param patch_file: optional explicit patch file, bypassing the lookup.
    :return: the result of `apply_patch()` or `pkg_apply_patch()`, or `True`
        if no packaged patch file exists.
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # First try the patch file for the specific version
    versioned_patch = f"{doc.name}.patch"
    if pkg_file_exists(data_module, versioned_patch):
        return pkg_apply_patch(data_module, versioned_patch, output_path)

    # Then try the patch file shared between versions
    shared_patch = f"{doc.name.split('-')[0]}.patch"
    if pkg_file_exists(data_module, shared_patch):
        return pkg_apply_patch(data_module, shared_patch, output_path)

    # Nothing to apply
    return True
def
convert( doc: pypdfium2._helpers.document.PdfDocument, page_range: Iterable[int], output_path: pathlib.Path, format_chapters: bool = False, pretty: bool = True, render_html: bool = True, render_pdf: bool = False, render_all: bool = False, show_ast: bool = False, show_tree: bool = False, show_tags: bool = False) -> bool:
def convert(
    doc: pp.PdfDocument,
    page_range: Iterable[int],
    output_path: Path,
    format_chapters: bool = False,
    pretty: bool = True,
    render_html: bool = True,
    render_pdf: bool = False,
    render_all: bool = False,
    show_ast: bool = False,
    show_tree: bool = False,
    show_tags: bool = False,
) -> bool:
    """Convert the selected pages of `doc` into HTML and/or debug output.

    Every page yielded by `doc.pages(page_range)` is visited; pages whose
    `is_relevant` attribute is false are skipped unless `render_all` is set.
    The areas of each visited page's `content_ast` are merged into a single
    document tree, which is normalized via `doc._normalize()` before it is
    printed and/or written as HTML.

    :param doc: the document providing `pages()` and `_normalize()`.
    :param page_range: the page selection handed to `doc.pages()`.
    :param output_path: the HTML output file, or the folder receiving the
        `chapter_*.html` files when `format_chapters` is set. Its stem also
        names the debug PDF.
    :param format_chapters: write one HTML file per top-level child whose
        name is `"chapter"` instead of a single file.
    :param pretty: forwarded to `write_html()`.
    :param render_html: format the merged document and write it as HTML.
    :param render_pdf: collect the result of `annotate_debug_info()` for each
        visited page and save it as `debug_<output_path.stem>.pdf` in the
        current working directory.
    :param render_all: also visit pages that are not marked as relevant.
    :param show_ast: print the tree of every area of every visited page.
    :param show_tree: print the tree of the merged and normalized document.
    :param show_tags: print `descr()` of every structure of every visited page.
    :return: always `True`.
    """
    document = None
    debug_doc = None
    debug_index = 0
    build_document = show_tree or render_html

    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if build_document or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if build_document:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = annotate_debug_info(page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was visited, so there is nothing to wrap in a
            # PdfDocument. Skip the save instead of creating an empty file
            # and then failing on `pp.PdfDocument(None)`.
            print("No pages rendered, skipping debug PDF!")
        else:
            with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if not build_document:
        return True
    if document is None:
        print("No pages parsed, empty document!")
        return True

    document = doc._normalize(document)
    if show_tree:
        print(RenderTree(document))
    if not render_html:
        return True

    if format_chapters:
        for chapter in document.children:
            if chapter.name != "chapter":
                continue
            print(f"\nFormatting HTML for '{chapter.title}'")
            html = format_document(chapter)
            output_file = f"{output_path}/chapter_{chapter._filename}.html"
            print(f"\nWriting HTML '{output_file}'")
            write_html(html, output_file, pretty=pretty)
    else:
        print("\nFormatting HTML")
        html = format_document(document)
        print(f"\nWriting HTML '{str(output_path)}'")
        write_html(html, str(output_path), pretty=pretty)

    return True
def
patch( doc, data_module, output_path: pathlib.Path, patch_file: pathlib.Path = None) -> bool:
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """Apply a patch to `output_path`.

    An explicitly given `patch_file` is handed to `apply_patch()`. Without
    one, a patch file is looked up inside `data_module`: first
    `<doc.name>.patch`, then `<prefix>.patch` where `<prefix>` is the part of
    `doc.name` before its first `-`.

    :param doc: object whose `name` attribute selects the packaged patch file.
    :param data_module: the package searched for the patch file.
    :param output_path: the location the patch is applied to.
    :param patch_file: optional explicit patch file, bypassing the lookup.
    :return: the result of `apply_patch()` or `pkg_apply_patch()`, or `True`
        if no packaged patch file exists.
    """
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # First try the patch file for the specific version
    versioned_patch = f"{doc.name}.patch"
    if pkg_file_exists(data_module, versioned_patch):
        return pkg_apply_patch(data_module, versioned_patch, output_path)

    # Then try the patch file shared between versions
    shared_patch = f"{doc.name.split('-')[0]}.patch"
    if pkg_file_exists(data_module, shared_patch):
        return pkg_apply_patch(data_module, shared_patch, output_path)

    # Nothing to apply
    return True