modm_data.pdf2html.convert
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

from anytree import RenderTree

from .html import format_document, write_html
from .render import render_page_pdf
from ..utils import pkg_apply_patch, pkg_file_exists, apply_patch
from .ast import merge_area
from pathlib import Path
import pypdfium2 as pp


def convert(
    doc,
    page_range,
    output_path,
    format_chapters=False,
    pretty=True,
    render_html=True,
    render_pdf=False,
    render_all=False,
    show_ast=False,
    show_tree=False,
    show_tags=False,
) -> bool:
    """Convert the selected pages of a document to HTML and/or a debug PDF.

    Every page yielded by ``doc.pages(page_range)`` is processed unless it is
    not relevant (``page.is_relevant`` is false) and ``render_all`` is unset.
    The content areas of the processed pages are merged into one document
    tree, which is normalized via ``doc._normalize()`` and then written out.

    :param doc: document object providing ``pages()`` and ``_normalize()``.
    :param page_range: page selection, forwarded to ``doc.pages()``.
    :param output_path: HTML output file, or the output directory when
        ``format_chapters`` is set. Must provide ``.stem`` (e.g. a
        ``pathlib.Path``) when ``render_pdf`` is set.
    :param format_chapters: write one ``chapter_<filename>.html`` file per
        top-level node named ``"chapter"`` instead of a single HTML file.
    :param pretty: forwarded to ``write_html()``.
    :param render_html: format and write the HTML output.
    :param render_pdf: render the processed pages into
        ``debug_<output_path.stem>.pdf`` (relative to the working directory).
    :param render_all: also process pages that are not relevant.
    :param show_ast: print the tree of every content area of each page.
    :param show_tree: print the tree of the normalized document.
    :param show_tags: print the description of every structure of each page.
    :return: always ``True``.
    """
    document = None
    debug_doc = None
    debug_index = 0
    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if show_tree or render_html or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if show_tree or render_html:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = render_page_pdf(doc, page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was rendered (empty range or all pages skipped), so
            # there is nothing to wrap in a PdfDocument and no file to write.
            print("No pages rendered, skipping debug PDF!")
        else:
            with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if show_tree or render_html:
        if document is None:
            print("No pages parsed, empty document!")
            return True

        document = doc._normalize(document)
        if show_tree:
            print(RenderTree(document))

        if render_html:
            if format_chapters:
                for chapter in document.children:
                    if chapter.name == "chapter":
                        print(f"\nFormatting HTML for '{chapter.title}'")
                        html = format_document(chapter)
                        output_file = f"{output_path}/chapter_{chapter._filename}.html"
                        print(f"\nWriting HTML '{output_file}'")
                        write_html(html, output_file, pretty=pretty)
            else:
                print("\nFormatting HTML")
                html = format_document(document)
                print(f"\nWriting HTML '{str(output_path)}'")
                write_html(html, str(output_path), pretty=pretty)

    return True


def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """Apply a patch file to the converted output.

    When ``patch_file`` is given it is applied directly via ``apply_patch()``.
    Otherwise a patch shipped inside ``data_module`` is looked up by document
    name and applied via ``pkg_apply_patch()``.

    :param doc: document object providing ``name``.
    :param data_module: package that is searched for bundled patch files.
    :param output_path: location the patch is applied to.
    :param patch_file: explicit patch file; ``None`` selects a bundled patch.
    :return: ``True`` when no bundled patch exists, otherwise the result of
        the patch helper that was called.
    """
    if patch_file is None:
        # First try the patch file for the specific version
        patch_file = f"{doc.name}.patch"
        if not pkg_file_exists(data_module, patch_file):
            # Then try the patch file shared between versions
            patch_file = f"{doc.name.split('-')[0]}.patch"
            if not pkg_file_exists(data_module, patch_file):
                return True
        return pkg_apply_patch(data_module, patch_file, output_path)
    return apply_patch(patch_file, output_path)
def
convert( doc, page_range, output_path, format_chapters=False, pretty=True, render_html=True, render_pdf=False, render_all=False, show_ast=False, show_tree=False, show_tags=False) -> bool:
def convert(
    doc,
    page_range,
    output_path,
    format_chapters=False,
    pretty=True,
    render_html=True,
    render_pdf=False,
    render_all=False,
    show_ast=False,
    show_tree=False,
    show_tags=False,
) -> bool:
    """Convert the selected pages of a document to HTML and/or a debug PDF.

    Every page yielded by ``doc.pages(page_range)`` is processed unless it is
    not relevant (``page.is_relevant`` is false) and ``render_all`` is unset.
    The content areas of the processed pages are merged into one document
    tree, which is normalized via ``doc._normalize()`` and then written out.

    :param doc: document object providing ``pages()`` and ``_normalize()``.
    :param page_range: page selection, forwarded to ``doc.pages()``.
    :param output_path: HTML output file, or the output directory when
        ``format_chapters`` is set. Must provide ``.stem`` (e.g. a
        ``pathlib.Path``) when ``render_pdf`` is set.
    :param format_chapters: write one ``chapter_<filename>.html`` file per
        top-level node named ``"chapter"`` instead of a single HTML file.
    :param pretty: forwarded to ``write_html()``.
    :param render_html: format and write the HTML output.
    :param render_pdf: render the processed pages into
        ``debug_<output_path.stem>.pdf`` (relative to the working directory).
    :param render_all: also process pages that are not relevant.
    :param show_ast: print the tree of every content area of each page.
    :param show_tree: print the tree of the normalized document.
    :param show_tags: print the description of every structure of each page.
    :return: always ``True``.
    """
    document = None
    debug_doc = None
    debug_index = 0
    for page in doc.pages(page_range):
        if not render_all and not page.is_relevant:
            continue
        print(f"\n\n=== {page.top} #{page.number} ===\n")

        if show_tags:
            for struct in page.structures:
                print(struct.descr())

        if show_tree or render_html or show_ast:
            areas = page.content_ast
            if show_ast:
                print()
                for area in areas:
                    print(RenderTree(area))
            if show_tree or render_html:
                for area in areas:
                    document = merge_area(document, area)

        if render_pdf:
            debug_doc = render_page_pdf(doc, page, debug_doc, debug_index)
            debug_index += 1

    if render_pdf:
        if debug_doc is None:
            # No page was rendered (empty range or all pages skipped), so
            # there is nothing to wrap in a PdfDocument and no file to write.
            print("No pages rendered, skipping debug PDF!")
        else:
            with open(f"debug_{output_path.stem}.pdf", "wb") as file_handle:
                pp.PdfDocument(debug_doc).save(file_handle)

    if show_tree or render_html:
        if document is None:
            print("No pages parsed, empty document!")
            return True

        document = doc._normalize(document)
        if show_tree:
            print(RenderTree(document))

        if render_html:
            if format_chapters:
                for chapter in document.children:
                    if chapter.name == "chapter":
                        print(f"\nFormatting HTML for '{chapter.title}'")
                        html = format_document(chapter)
                        output_file = f"{output_path}/chapter_{chapter._filename}.html"
                        print(f"\nWriting HTML '{output_file}'")
                        write_html(html, output_file, pretty=pretty)
            else:
                print("\nFormatting HTML")
                html = format_document(document)
                print(f"\nWriting HTML '{str(output_path)}'")
                write_html(html, str(output_path), pretty=pretty)

    return True
def
patch( doc, data_module, output_path: pathlib.Path, patch_file: pathlib.Path = None) -> bool:
def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
    """Apply a patch file to the converted output.

    An explicitly given ``patch_file`` is applied via ``apply_patch()``.
    Without one, a patch bundled in ``data_module`` is looked up by document
    name and applied via ``pkg_apply_patch()``.

    :param doc: document object providing ``name``.
    :param data_module: package that is searched for bundled patch files.
    :param output_path: location the patch is applied to.
    :param patch_file: explicit patch file; ``None`` selects a bundled patch.
    :return: ``True`` when no bundled patch exists, otherwise the result of
        the patch helper that was called.
    """
    # An explicit patch file always wins over the bundled ones.
    if patch_file is not None:
        return apply_patch(patch_file, output_path)

    # Prefer the patch written for this exact document version.
    versioned_patch = f"{doc.name}.patch"
    if pkg_file_exists(data_module, versioned_patch):
        return pkg_apply_patch(data_module, versioned_patch, output_path)

    # Fall back to the patch shared by all versions of the document.
    shared_patch = f"{doc.name.split('-')[0]}.patch"
    if pkg_file_exists(data_module, shared_patch):
        return pkg_apply_patch(data_module, shared_patch, output_path)

    # Nothing to apply counts as success.
    return True