modm_data.pdf.document
PDF Documents
The PDF document is the root of the entire data structure and provides access to PDF metadata, the table of contents, as well as individual pages.
You should extend from this class for a specific vendor to provide the correct page class from the `page()` function.
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# PDF Documents

The PDF document is the root of the entire data structure and provides access to
PDF metadata, the table of contents, as well as individual pages.

You should extend from this class for a specific vendor to provide the
correct page class from the `page()` function.
"""


import ctypes
import logging
import pypdfium2 as pp
from typing import Iterator, Iterable
from pathlib import Path
from functools import cached_property, cache
from collections import defaultdict
from .page import Page

LOGGER = logging.getLogger(__name__)


# We cannot monkey patch this class, since it's a named tuple. :-(
class _OutlineItem(pp.PdfOutlineItem):
    """Outline item that compares and hashes by page index and title only."""

    def __hash__(self) -> int:
        # Must stay consistent with __eq__, which only looks at these two fields.
        return hash(f"{self.page_index}+{self.title}")

    def __eq__(self, other) -> bool:
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.page_index == other.page_index and self.title == other.title

    def __repr__(self) -> str:
        return f"O({self.page_index}, {self.level}, {self.title})"


class Document(pp.PdfDocument):
    """
    This class is a convenience wrapper with caching around the high-level APIs
    of pypdfium.
    """
    def __init__(self, path: Path, autoclose: bool = False):
        """
        :param path: Path to the PDF to open.
        :param autoclose: Passed through unchanged to the `pypdfium2`
                          document constructor.
        """
        path = Path(path)
        self.name: str = path.stem
        """Stem of the document file name"""
        super().__init__(path, autoclose=autoclose)
        self._path = path
        # Two-level dictionary cache; it is only created here and not used
        # anywhere else inside this class.
        self._bbox_cache = defaultdict(dict)
        LOGGER.debug("Loading: %s", path)

    @cached_property
    def metadata(self) -> dict[str, str]:
        """The PDF metadata dictionary."""
        return self.get_metadata_dict()

    @property
    def destinations(self) -> Iterator[tuple[int, str]]:
        """Yields (page 0-index, named destination) of the whole document."""
        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
            # The result of the first call (no buffer, no length) is used as
            # the buffer length for the second call.
            # NOTE(review): presumably the size query of PDFium's two-call
            # pattern -- confirm against the FPDF_GetNamedDest reference.
            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
            clength = ctypes.c_long(length)
            cbuffer = ctypes.create_string_buffer(length*2)
            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
            # The buffer content is decoded as UTF-16-LE; trailing NULs are dropped.
            name = cbuffer.raw[:clength.value*2].decode("utf-16-le").rstrip("\x00")
            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
            yield (page, name)

    @cached_property
    def toc(self) -> list[pp.PdfOutlineItem]:
        """
        The table of contents as a sorted list of outline items, ensuring each
        item has a page index by reusing the last known one.

        Items are sorted by (page index, level, title). Duplicate entries
        (same page index and title) are collapsed into one.
        """
        # Sometimes the TOC contains duplicates so we must use a set
        unique_items = set()
        last_page_index = 0
        for item in self.get_toc():
            # Only remember real page indexes: storing a missing (None) index
            # would hand None to the next index-less item and break sorting.
            # The explicit None check also keeps a valid page index of 0.
            if item.page_index is not None:
                last_page_index = item.page_index
            unique_items.add(_OutlineItem(item.level, item.title, item.is_closed,
                                          item.n_kids, last_page_index,
                                          item.view_mode, item.view_pos))
        return sorted(unique_items, key=lambda o: (o.page_index, o.level, o.title))

    @cached_property
    def identifier_permanent(self) -> str:
        """The permanent file identifier."""
        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)

    @cached_property
    def identifier_changing(self) -> str:
        """The changing file identifier."""
        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)

    @cached_property
    def page_count(self) -> int:
        """The number of pages in the document."""
        return pp.raw.FPDF_GetPageCount(self)

    @cache
    def page(self, index: int) -> Page:
        """
        :param index: 0-indexed page number.
        :return: the page object for the index.
        :raises IndexError: if the index is negative or not below `page_count`.
        """
        # Explicit check instead of `assert`, which is stripped under `-O`.
        if not 0 <= index < self.page_count:
            raise IndexError(f"Page index {index} is out of range for a "
                             f"document with {self.page_count} pages")
        return Page(self, index)

    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
        """
        :param numbers: an iterable range of page numbers (0-indexed!).
                        If `None`, then the whole page range is used.
        :return: yields each page in the range; numbers outside of the
                 document are silently skipped.
        """
        if numbers is None:
            numbers = range(self.page_count)
        for ii in numbers:
            if 0 <= ii < self.page_count:
                yield self.page(ii)

    def __repr__(self) -> str:
        return f"Doc({self.name})"
LOGGER =
<Logger modm_data.pdf.document (WARNING)>
class
Document(pypdfium2._helpers.document.PdfDocument):
class Document(pp.PdfDocument):
    """
    This class is a convenience wrapper with caching around the high-level APIs
    of pypdfium.
    """
    def __init__(self, path: Path, autoclose: bool = False):
        """
        :param path: Path to the PDF to open.
        :param autoclose: Passed through unchanged to the `pypdfium2`
                          document constructor.
        """
        path = Path(path)
        self.name: str = path.stem
        """Stem of the document file name"""
        super().__init__(path, autoclose=autoclose)
        self._path = path
        # Two-level dictionary cache; it is only created here and not used
        # anywhere else inside this class.
        self._bbox_cache = defaultdict(dict)
        LOGGER.debug("Loading: %s", path)

    @cached_property
    def metadata(self) -> dict[str, str]:
        """The PDF metadata dictionary."""
        return self.get_metadata_dict()

    @property
    def destinations(self) -> Iterator[tuple[int, str]]:
        """Yields (page 0-index, named destination) of the whole document."""
        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
            # The result of the first call (no buffer, no length) is used as
            # the buffer length for the second call.
            # NOTE(review): presumably the size query of PDFium's two-call
            # pattern -- confirm against the FPDF_GetNamedDest reference.
            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
            clength = ctypes.c_long(length)
            cbuffer = ctypes.create_string_buffer(length*2)
            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
            # The buffer content is decoded as UTF-16-LE; trailing NULs are dropped.
            name = cbuffer.raw[:clength.value*2].decode("utf-16-le").rstrip("\x00")
            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
            yield (page, name)

    @cached_property
    def toc(self) -> list[pp.PdfOutlineItem]:
        """
        The table of contents as a sorted list of outline items, ensuring each
        item has a page index by reusing the last known one.

        Items are sorted by (page index, level, title). Duplicate entries
        (same page index and title) are collapsed into one.
        """
        # Sometimes the TOC contains duplicates so we must use a set
        unique_items = set()
        last_page_index = 0
        for item in self.get_toc():
            # Only remember real page indexes: storing a missing (None) index
            # would hand None to the next index-less item and break sorting.
            # The explicit None check also keeps a valid page index of 0.
            if item.page_index is not None:
                last_page_index = item.page_index
            unique_items.add(_OutlineItem(item.level, item.title, item.is_closed,
                                          item.n_kids, last_page_index,
                                          item.view_mode, item.view_pos))
        return sorted(unique_items, key=lambda o: (o.page_index, o.level, o.title))

    @cached_property
    def identifier_permanent(self) -> str:
        """The permanent file identifier."""
        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)

    @cached_property
    def identifier_changing(self) -> str:
        """The changing file identifier."""
        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)

    @cached_property
    def page_count(self) -> int:
        """The number of pages in the document."""
        return pp.raw.FPDF_GetPageCount(self)

    @cache
    def page(self, index: int) -> Page:
        """
        :param index: 0-indexed page number.
        :return: the page object for the index.
        :raises IndexError: if the index is negative or not below `page_count`.
        """
        # Explicit check instead of `assert`, which is stripped under `-O`.
        if not 0 <= index < self.page_count:
            raise IndexError(f"Page index {index} is out of range for a "
                             f"document with {self.page_count} pages")
        return Page(self, index)

    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
        """
        :param numbers: an iterable range of page numbers (0-indexed!).
                        If `None`, then the whole page range is used.
        :return: yields each page in the range; numbers outside of the
                 document are silently skipped.
        """
        if numbers is None:
            numbers = range(self.page_count)
        for ii in numbers:
            if 0 <= ii < self.page_count:
                yield self.page(ii)

    def __repr__(self) -> str:
        return f"Doc({self.name})"
This class is a convenience wrapper with caching around the high-level APIs of pypdfium.
Document(path: pathlib.Path, autoclose: bool = False)
def __init__(self, path: Path, autoclose: bool = False):
    """
    :param path: Path to the PDF to open.
    :param autoclose: Passed through unchanged to the `pypdfium2`
                      document constructor.
    """
    path = Path(path)
    self.name: str = path.stem
    """Stem of the document file name"""
    super().__init__(path, autoclose=autoclose)
    self._path = path
    # Two-level dictionary cache; it is only created here and not used by
    # the constructor itself.
    self._bbox_cache = defaultdict(dict)
    # Lazy %-formatting: the message is only built if DEBUG is enabled.
    LOGGER.debug("Loading: %s", path)
Parameters
- path: Path to the PDF to open.
metadata: dict[str, str]
@cached_property
def metadata(self) -> dict[str, str]:
    """
    The PDF metadata dictionary.

    The dictionary is obtained from `get_metadata_dict()` on first access and
    cached on the instance afterwards.
    """
    info = self.get_metadata_dict()
    return info
The PDF metadata dictionary.
destinations: Iterator[tuple[int, str]]
@property
def destinations(self) -> Iterator[tuple[int, str]]:
    """
    Yields (page 0-index, named destination) of the whole document.

    This is a plain property wrapping a generator, so every access starts a
    new iteration over all named destinations; nothing is cached.
    """
    for ii in range(pp.raw.FPDF_CountNamedDests(self)):
        # First call passes 0 for both the buffer and the length argument;
        # its return value is used below as the buffer length.
        # NOTE(review): presumably the size query of PDFium's two-call
        # pattern -- confirm against the FPDF_GetNamedDest reference.
        length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
        clength = ctypes.c_long(length)
        # The buffer is allocated with twice `length` bytes.
        cbuffer = ctypes.create_string_buffer(length*2)
        # Second call receives the buffer and the length object; its return
        # value is what gets passed to the page index lookup below.
        dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
        # The buffer content is decoded as UTF-16-LE; trailing NULs are dropped.
        name = cbuffer.raw[:clength.value*2].decode("utf-16-le").rstrip("\x00")
        page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
        yield (page, name)
Yields (page 0-index, named destination) of the whole document.
toc: list[pypdfium2._helpers.document.PdfOutlineItem]
@cached_property
def toc(self) -> list[pp.PdfOutlineItem]:
    """
    The table of contents as a sorted list of outline items, ensuring each
    item has a page index by reusing the last known one.

    Items are sorted by (page index, level, title). Duplicate entries
    (same page index and title) are collapsed into one. The result is
    computed on first access and cached on the instance.
    """
    # Sometimes the TOC contains duplicates so we must use a set
    unique_items = set()
    last_page_index = 0
    for item in self.get_toc():
        # Only remember real page indexes: storing a missing (None) index
        # would hand None to the next index-less item and break sorting.
        # The explicit None check also keeps a valid page index of 0.
        if item.page_index is not None:
            last_page_index = item.page_index
        unique_items.add(_OutlineItem(item.level, item.title, item.is_closed,
                                      item.n_kids, last_page_index,
                                      item.view_mode, item.view_pos))
    return sorted(unique_items, key=lambda o: (o.page_index, o.level, o.title))
The table of contents as a sorted list of outline items, ensuring each item has a page index by reusing the last known one.
identifier_permanent: str
@cached_property
def identifier_permanent(self) -> str:
    """
    The permanent file identifier.

    Looked up via `get_identifier()` on first access, then cached on the
    instance.
    """
    id_type = pp.raw.FILEIDTYPE_PERMANENT
    return self.get_identifier(id_type)
The permanent file identifier.
identifier_changing: str
@cached_property
def identifier_changing(self) -> str:
    """
    The changing file identifier.

    Looked up via `get_identifier()` on first access, then cached on the
    instance.
    """
    id_type = pp.raw.FILEIDTYPE_CHANGING
    return self.get_identifier(id_type)
The changing file identifier.
page_count: int
@cached_property
def page_count(self) -> int:
    """
    The number of pages in the document.

    Queried through the raw `FPDF_GetPageCount` binding on first access and
    cached on the instance afterwards.
    """
    count = pp.raw.FPDF_GetPageCount(self)
    return count
The number of pages in the document.
@cache
def page(self, index: int) -> Page:
    """
    :param index: 0-indexed page number.
    :return: the page object for the index. Results are memoized per
             (document, index) pair by `functools.cache`.
    :raises IndexError: if the index is negative or not below `page_count`.
    """
    # Explicit check instead of `assert`, which is stripped under `-O` and
    # did not reject negative indexes.
    if not 0 <= index < self.page_count:
        raise IndexError(f"Page index {index} is out of range for a "
                         f"document with {self.page_count} pages")
    return Page(self, index)
Parameters
- index: 0-indexed page number.
Returns
the page object for the index.
def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
    """
    Iterate over a selection of pages.

    :param numbers: an iterable range of page numbers (0-indexed!).
                    If `None`, then the whole page range is used.
    :return: yields each page in the range; numbers that do not fall inside
             the document are skipped without an error.
    """
    total = self.page_count
    selection = range(total) if numbers is None else numbers
    for index in selection:
        if not (0 <= index < total):
            continue
        yield self.page(index)
Parameters
- numbers: an iterable range of page numbers (0-indexed!).
If
None
, then the whole page range is used.
Returns
yields each page in the range.
Inherited Members
- pypdfium2._helpers.document.PdfDocument
- formenv
- parent
- new
- init_forms
- get_formtype
- get_pagemode
- is_tagged
- save
- get_identifier
- get_version
- get_metadata_value
- METADATA_KEYS
- get_metadata_dict
- count_attachments
- get_attachment
- new_attachment
- del_attachment
- get_page
- new_page
- del_page
- import_pages
- get_page_size
- get_page_label
- page_as_xobject
- get_toc
- render
- pypdfium2.internal.bases.AutoCloseable
- close