modm_data.pdf.document

PDF Documents

The PDF document is the root of the entire data structure and provides access to PDF metadata, the table of contents, as well as individual pages.

You should extend this class for a specific vendor to provide the correct page class from the page() function.

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4"""
  5# PDF Documents
  6
  7The PDF document is the root of the entire data structure and provides access to
  8PDF metadata, the table of contents, as well as individual pages.
  9
 10You should extend this class for a specific vendor to provide the
 11correct page class from the `page()` function.
 12"""
 13
 14
 15import ctypes
 16import logging
 17import pypdfium2 as pp
 18from typing import Iterator, Iterable
 19from pathlib import Path
 20from functools import cached_property, cache
 21from collections import defaultdict
 22from .page import Page
 23
 24LOGGER = logging.getLogger(__name__)
 25
 26
 27# We cannot monkey patch this class, since it's a named tuple. :-(
 28class _OutlineItem(pp.PdfOutlineItem):
 29    def __hash__(self) -> int:
 30        return hash(f"{self.page_index}+{self.title}")
 31
 32    def __eq__(self, other) -> bool:
 33        if not isinstance(other, type(self)): return NotImplemented
 34        return self.page_index == other.page_index and self.title == other.title
 35
 36    def __repr__(self) -> str:
 37        return f"O({self.page_index}, {self.level}, {self.title})"
 38
 39
 40class Document(pp.PdfDocument):
 41    """
 42    This class is a convenience wrapper with caching around the high-level APIs
 43    of pypdfium.
 44    """
 45    def __init__(self, path: Path, autoclose: bool = False):
 46        """
 47        :param path: Path to the PDF to open.
 48        """
 49        path = Path(path)
 50        self.name: str = path.stem
 51        super().__init__(path, autoclose=autoclose)
 52        """Stem of the document file name"""
 53        self._path = path
 54        self._bbox_cache = defaultdict(dict)
 55        LOGGER.debug(f"Loading: {path}")
 56
 57    @cached_property
 58    def metadata(self) -> dict[str, str]:
 59        """The PDF metadata dictionary."""
 60        return self.get_metadata_dict()
 61
 62    @property
 63    def destinations(self) -> Iterator[tuple[int, str]]:
 64        """Yields (page 0-index, named destination) of the whole document."""
 65        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
 66            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
 67            clength = ctypes.c_long(length)
 68            cbuffer = ctypes.create_string_buffer(length*2)
 69            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
 70            name = cbuffer.raw[:clength.value*2].decode("utf-16-le").rstrip("\x00")
 71            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
 72            yield (page, name)
 73
 74    @cached_property
 75    def toc(self) -> list[pp.PdfOutlineItem]:
 76        """
 77        The table of content as a sorted list of outline items ensuring item has
 78        a page index by reusing the last one.
 79        """
 80        tocs = set()
 81        # Sometimes the TOC contains duplicates so we must use a set
 82        last_page_index = 0
 83        for toc in self.get_toc():
 84            outline = _OutlineItem(toc.level, toc.title, toc.is_closed,
 85                                   toc.n_kids, toc.page_index or last_page_index,
 86                                   toc.view_mode, toc.view_pos)
 87            last_page_index = toc.page_index
 88            tocs.add(outline)
 89        return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title)))
 90
 91    @cached_property
 92    def identifier_permanent(self) -> str:
 93        """The permanent file identifier."""
 94        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)
 95
 96    @cached_property
 97    def identifier_changing(self) -> str:
 98        """The changing file identifier."""
 99        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)
100
101    @cached_property
102    def page_count(self) -> int:
103        """The number of pages in the document."""
104        return pp.raw.FPDF_GetPageCount(self)
105
106    @cache
107    def page(self, index: int) -> Page:
108        """
109        :param index: 0-indexed page number.
110        :return: the page object for the index.
111        """
112        assert index < self.page_count
113        return Page(self, index)
114
115    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
116        """
117        :param numbers: an iterable range of page numbers (0-indexed!).
118                        If `None`, then the whole page range is used.
119        :return: yields each page in the range.
120        """
121        if numbers is None:
122            numbers = range(self.page_count)
123        for ii in numbers:
124            if 0 <= ii < self.page_count:
125                yield self.page(ii)
126
127    def __repr__(self) -> str:
128        return f"Doc({self.name})"
LOGGER = <Logger modm_data.pdf.document (WARNING)>
class Document(pypdfium2._helpers.document.PdfDocument):
 41class Document(pp.PdfDocument):
 42    """
 43    This class is a convenience wrapper with caching around the high-level APIs
 44    of pypdfium.
 45    """
 46    def __init__(self, path: Path, autoclose: bool = False):
 47        """
 48        :param path: Path to the PDF to open.
 49        """
 50        path = Path(path)
 51        self.name: str = path.stem
 52        super().__init__(path, autoclose=autoclose)
 53        """Stem of the document file name"""
 54        self._path = path
 55        self._bbox_cache = defaultdict(dict)
 56        LOGGER.debug(f"Loading: {path}")
 57
 58    @cached_property
 59    def metadata(self) -> dict[str, str]:
 60        """The PDF metadata dictionary."""
 61        return self.get_metadata_dict()
 62
 63    @property
 64    def destinations(self) -> Iterator[tuple[int, str]]:
 65        """Yields (page 0-index, named destination) of the whole document."""
 66        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
 67            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
 68            clength = ctypes.c_long(length)
 69            cbuffer = ctypes.create_string_buffer(length*2)
 70            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
 71            name = cbuffer.raw[:clength.value*2].decode("utf-16-le").rstrip("\x00")
 72            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
 73            yield (page, name)
 74
 75    @cached_property
 76    def toc(self) -> list[pp.PdfOutlineItem]:
 77        """
 78        The table of content as a sorted list of outline items ensuring item has
 79        a page index by reusing the last one.
 80        """
 81        tocs = set()
 82        # Sometimes the TOC contains duplicates so we must use a set
 83        last_page_index = 0
 84        for toc in self.get_toc():
 85            outline = _OutlineItem(toc.level, toc.title, toc.is_closed,
 86                                   toc.n_kids, toc.page_index or last_page_index,
 87                                   toc.view_mode, toc.view_pos)
 88            last_page_index = toc.page_index
 89            tocs.add(outline)
 90        return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title)))
 91
 92    @cached_property
 93    def identifier_permanent(self) -> str:
 94        """The permanent file identifier."""
 95        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)
 96
 97    @cached_property
 98    def identifier_changing(self) -> str:
 99        """The changing file identifier."""
100        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)
101
102    @cached_property
103    def page_count(self) -> int:
104        """The number of pages in the document."""
105        return pp.raw.FPDF_GetPageCount(self)
106
107    @cache
108    def page(self, index: int) -> Page:
109        """
110        :param index: 0-indexed page number.
111        :return: the page object for the index.
112        """
113        assert index < self.page_count
114        return Page(self, index)
115
116    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
117        """
118        :param numbers: an iterable range of page numbers (0-indexed!).
119                        If `None`, then the whole page range is used.
120        :return: yields each page in the range.
121        """
122        if numbers is None:
123            numbers = range(self.page_count)
124        for ii in numbers:
125            if 0 <= ii < self.page_count:
126                yield self.page(ii)
127
128    def __repr__(self) -> str:
129        return f"Doc({self.name})"

This class is a convenience wrapper with caching around the high-level APIs of pypdfium.

Document(path: pathlib.Path, autoclose: bool = False)
46    def __init__(self, path: Path, autoclose: bool = False):
47        """
48        :param path: Path to the PDF to open.
49        """
50        path = Path(path)
51        self.name: str = path.stem
52        super().__init__(path, autoclose=autoclose)
53        """Stem of the document file name"""
54        self._path = path
55        self._bbox_cache = defaultdict(dict)
56        LOGGER.debug(f"Loading: {path}")
Parameters
  • path: Path to the PDF to open.
name: str
metadata: dict[str, str]
58    @cached_property
59    def metadata(self) -> dict[str, str]:
60        """The PDF metadata dictionary."""
61        return self.get_metadata_dict()

The PDF metadata dictionary.

destinations: Iterator[tuple[int, str]]
    @property
    def destinations(self) -> Iterator[tuple[int, str]]:
        """
        Yields (page 0-index, named destination) of the whole document.

        Every access of this property creates a new generator.
        """
        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
            # NOTE(review): the first call's return value is used as the name
            # length, and the length is doubled below. PDFium documents
            # FPDF_GetNamedDest as returning the destination handle and
            # reporting the byte length through `buflen` -- confirm.
            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
            clength = ctypes.c_long(length)
            cbuffer = ctypes.create_string_buffer(length*2)
            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
            # The name is decoded as UTF-16-LE; trailing NULs are dropped.
            name = cbuffer.raw[:clength.value*2].decode("utf-16-le").rstrip("\x00")
            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
            yield (page, name)

Yields (page 0-index, named destination) of the whole document.

toc: list[pypdfium2._helpers.document.PdfOutlineItem]
75    @cached_property
76    def toc(self) -> list[pp.PdfOutlineItem]:
77        """
78        The table of content as a sorted list of outline items ensuring item has
79        a page index by reusing the last one.
80        """
81        tocs = set()
82        # Sometimes the TOC contains duplicates so we must use a set
83        last_page_index = 0
84        for toc in self.get_toc():
85            outline = _OutlineItem(toc.level, toc.title, toc.is_closed,
86                                   toc.n_kids, toc.page_index or last_page_index,
87                                   toc.view_mode, toc.view_pos)
88            last_page_index = toc.page_index
89            tocs.add(outline)
90        return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title)))

The table of contents as a sorted list of outline items, ensuring each item has a page index by reusing the last one.

identifier_permanent: str
92    @cached_property
93    def identifier_permanent(self) -> str:
94        """The permanent file identifier."""
95        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)

The permanent file identifier.

identifier_changing: str
 97    @cached_property
 98    def identifier_changing(self) -> str:
 99        """The changing file identifier."""
100        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)

The changing file identifier.

page_count: int
102    @cached_property
103    def page_count(self) -> int:
104        """The number of pages in the document."""
105        return pp.raw.FPDF_GetPageCount(self)

The number of pages in the document.

@cache
def page(self, index: int) -> modm_data.pdf.page.Page:
107    @cache
108    def page(self, index: int) -> Page:
109        """
110        :param index: 0-indexed page number.
111        :return: the page object for the index.
112        """
113        assert index < self.page_count
114        return Page(self, index)
Parameters
  • index: 0-indexed page number.
Returns

the page object for the index.

def pages(self, numbers: Iterable[int] = None) -> Iterator[modm_data.pdf.page.Page]:
116    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
117        """
118        :param numbers: an iterable range of page numbers (0-indexed!).
119                        If `None`, then the whole page range is used.
120        :return: yields each page in the range.
121        """
122        if numbers is None:
123            numbers = range(self.page_count)
124        for ii in numbers:
125            if 0 <= ii < self.page_count:
126                yield self.page(ii)
Parameters
  • numbers: an iterable range of page numbers (0-indexed!). If None, then the whole page range is used.
Returns

yields each page in the range.

Inherited Members
pypdfium2._helpers.document.PdfDocument
formenv
parent
new
init_forms
get_formtype
get_pagemode
is_tagged
save
get_identifier
get_version
get_metadata_value
METADATA_KEYS
get_metadata_dict
count_attachments
get_attachment
new_attachment
del_attachment
get_page
new_page
del_page
import_pages
get_page_size
get_page_label
page_as_xobject
get_toc
render
pypdfium2.internal.bases.AutoCloseable
close