modm_data.pdf.document

PDF Documents

The PDF document is the root of the entire data structure and provides access to PDF metadata, the table of contents, as well as individual pages.
You should extend this class for a specific vendor so that its `page()` function returns the correct page class.
View Source
  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4"""
  5# PDF Documents
  6
  7The PDF document is the root of the entire data structure and provides access to
  8PDF metadata, the table of contents, as well as individual pages.
  9
 10You should extend from this class for a specific vendor to provide the
 11correct page class from `page()` function.
 12"""
 13
 14
 15import ctypes
 16import logging
 17import pypdfium2 as pp
 18from typing import Iterator, Iterable
 19from pathlib import Path
 20from functools import cached_property, cache
 21from collections import defaultdict
 22from .page import Page
 23
 24LOGGER = logging.getLogger(__name__)
 25
 26
 27# We cannot monkey patch this class, since it's a named tuple. :-(
 28class _OutlineItem(pp.PdfOutlineItem):
 29    def __hash__(self) -> int:
 30        return hash(f"{self.page_index}+{self.title}")
 31
 32    def __eq__(self, other) -> bool:
 33        if not isinstance(other, type(self)): return NotImplemented
 34        return self.page_index == other.page_index and self.title == other.title
 35
 36    def __repr__(self) -> str:
 37        return f"O({self.page_index}, {self.level}, {self.title})"
 38
 39
 40class Document(pp.PdfDocument):
 41    """
 42    This class is a convenience wrapper with caching around the high-level APIs
 43    of pypdfium.
 44    """
 45    def __init__(self, path: Path, autoclose: bool = False):
 46        """
 47        :param path: Path to the PDF to open.
 48        """
 49        path = Path(path)
 50        self.name: str = path.stem
 51        super().__init__(path, autoclose=autoclose)
 52        """Stem of the document file name"""
 53        self._path = path
 54        self._bbox_cache = defaultdict(dict)
 55        LOGGER.debug(f"Loading: {path}")
 56
 57    @cached_property
 58    def metadata(self) -> dict[str, str]:
 59        """The PDF metadata dictionary."""
 60        return self.get_metadata_dict()
 61
 62    @property
 63    def destinations(self) -> Iterator[tuple[int, str]]:
 64        """Yields (page 0-index, named destination) of the whole document."""
 65        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
 66            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
 67            clength = ctypes.c_long(length)
 68            cbuffer = ctypes.create_string_buffer(length*2)
 69            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
 70            name = cbuffer.raw[:clength.value*2].decode("utf-16-le").rstrip("\x00")
 71            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
 72            yield (page, name)
 73
 74    @cached_property
 75    def toc(self) -> list[pp.PdfOutlineItem]:
 76        """
 77        The table of content as a sorted list of outline items ensuring item has
 78        a page index by reusing the last one.
 79        """
 80        tocs = set()
 81        # Sometimes the TOC contains duplicates so we must use a set
 82        last_page_index = 0
 83        for toc in self.get_toc():
 84            outline = _OutlineItem(toc.level, toc.title, toc.is_closed,
 85                                   toc.n_kids, toc.page_index or last_page_index,
 86                                   toc.view_mode, toc.view_pos)
 87            last_page_index = toc.page_index
 88            tocs.add(outline)
 89        return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title)))
 90
 91    @cached_property
 92    def identifier_permanent(self) -> str:
 93        """The permanent file identifier."""
 94        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)
 95
 96    @cached_property
 97    def identifier_changing(self) -> str:
 98        """The changing file identifier."""
 99        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)
100
101    @cached_property
102    def page_count(self) -> int:
103        """The number of pages in the document."""
104        return pp.raw.FPDF_GetPageCount(self)
105
106    @cache
107    def page(self, index: int) -> Page:
108        """
109        :param index: 0-indexed page number.
110        :return: the page object for the index.
111        """
112        assert index < self.page_count
113        return Page(self, index)
114
115    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
116        """
117        :param numbers: an iterable range of page numbers (0-indexed!).
118                        If `None`, then the whole page range is used.
119        :return: yields each page in the range.
120        """
121        if numbers is None:
122            numbers = range(self.page_count)
123        for ii in numbers:
124            if 0 <= ii < self.page_count:
125                yield self.page(ii)
126
127    def __repr__(self) -> str:
128        return f"Doc({self.name})"
LOGGER = <Logger modm_data.pdf.document (WARNING)>
modm_data.pdf.document

PDF Documents

Parameters

Parameters

Returns

Parameters

Returns

Inherited Members