modm_data.pdf

PDF Content Accessors

This module extends the pypdfium2 Python API with low-level accessors for characters and graphics. Note that these modules support read-only access to PDFs, since a lot of caching is used to speed up commonly accessed properties.

This module only contains formatting-independent PDF access, which is then specialized in the vendor-specific modm_data.pdf2html modules.

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4"""
 5# PDF Content Accessors
 6
 7This module extends the pypdfium2 Python API with low-level accessors for
 8characters and graphics. Note that these modules support read-only access to
 9PDFs, since a lot of caching is used to speed up commonly accessed properties.
10
11This module only contains formatting independent PDF access which is then
12specialized in the vendor-specific `modm_data.pdf2html` modules.
13"""
14
15from .document import Document
16from .page import Page
17from .character import Character
18from .link import ObjLink, WebLink
19from .path import Path
20from .image import Image
21from .render import render_page_pdf
22from .structure import Structure
23
24__all__ = [
25    "Document",
26    "Page",
27    "Character",
28    "ObjLink",
29    "WebLink",
30    "Path",
31    "Image",
32    "Structure",
33    "render_page_pdf",
34]
class Document(pypdfium2._helpers.document.PdfDocument):
 41class Document(pp.PdfDocument):
 42    """
 43    This class is a convenience wrapper with caching around the high-level APIs
 44    of pypdfium.
 45    """
 46
 47    def __init__(self, path: Path, autoclose: bool = False):
 48        """
 49        :param path: Path to the PDF to open.
 50        """
 51        path = Path(path)
 52        self.name: str = path.stem
 53        """Stem of the document file name"""
 54        super().__init__(path, autoclose=autoclose)
 55        self._path = path
 56        self._bbox_cache = defaultdict(dict)
 57        _LOGGER.debug(f"Loading: {path}")
 58
 59    @cached_property
 60    def metadata(self) -> dict[str, str]:
 61        """The PDF metadata dictionary."""
 62        return self.get_metadata_dict()
 63
 64    @property
 65    def destinations(self) -> Iterator[tuple[int, str]]:
 66        """Yields (page 0-index, named destination) of the whole document."""
 67        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
 68            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
 69            clength = ctypes.c_long(length)
 70            cbuffer = ctypes.create_string_buffer(length * 2)
 71            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
 72            name = cbuffer.raw[: clength.value * 2].decode("utf-16-le").rstrip("\x00")
 73            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
 74            yield (page, name)
 75
 76    @cached_property
 77    def toc(self) -> list[pp.PdfOutlineItem]:
 78        """
 79        The table of content as a sorted list of outline items ensuring item has
 80        a page index by reusing the last one.
 81        """
 82        tocs = set()
 83        # Sometimes the TOC contains duplicates so we must use a set
 84        last_page_index = 0
 85        for toc in self.get_toc():
 86            outline = _OutlineItem(
 87                toc.level,
 88                toc.title,
 89                toc.is_closed,
 90                toc.n_kids,
 91                toc.page_index or last_page_index,
 92                toc.view_mode,
 93                toc.view_pos,
 94            )
 95            last_page_index = toc.page_index or last_page_index
 96            tocs.add(outline)
 97        return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title)))
 98
 99    @cached_property
100    def identifier_permanent(self) -> str:
101        """The permanent file identifier."""
102        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)
103
104    @cached_property
105    def identifier_changing(self) -> str:
106        """The changing file identifier."""
107        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)
108
109    @cached_property
110    def page_count(self) -> int:
111        """The number of pages in the document."""
112        return pp.raw.FPDF_GetPageCount(self)
113
114    @cache
115    def page(self, index: int) -> Page:
116        """
117        :param index: 0-indexed page number.
118        :return: the page object for the index.
119        """
120        assert index < self.page_count
121        return Page(self, index)
122
123    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
124        """
125        :param numbers: an iterable range of page numbers (0-indexed!).
126                        If `None`, then the whole page range is used.
127        :return: yields each page in the range.
128        """
129        if numbers is None:
130            numbers = range(self.page_count)
131        for ii in numbers:
132            if 0 <= ii < self.page_count:
133                yield self.page(ii)
134
135    def __repr__(self) -> str:
136        return f"Doc({self.name})"

This class is a convenience wrapper with caching around the high-level APIs of pypdfium.

Document(path: pathlib.Path, autoclose: bool = False)
47    def __init__(self, path: Path, autoclose: bool = False):
48        """
49        :param path: Path to the PDF to open.
50        """
51        path = Path(path)
52        self.name: str = path.stem
53        """Stem of the document file name"""
54        super().__init__(path, autoclose=autoclose)
55        self._path = path
56        self._bbox_cache = defaultdict(dict)
57        _LOGGER.debug(f"Loading: {path}")
Parameters
  • path: Path to the PDF to open.
name: str

Stem of the document file name

metadata: dict[str, str]
59    @cached_property
60    def metadata(self) -> dict[str, str]:
61        """The PDF metadata dictionary."""
62        return self.get_metadata_dict()

The PDF metadata dictionary.

destinations: Iterator[tuple[int, str]]
64    @property
65    def destinations(self) -> Iterator[tuple[int, str]]:
66        """Yields (page 0-index, named destination) of the whole document."""
67        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
68            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
69            clength = ctypes.c_long(length)
70            cbuffer = ctypes.create_string_buffer(length * 2)
71            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
72            name = cbuffer.raw[: clength.value * 2].decode("utf-16-le").rstrip("\x00")
73            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
74            yield (page, name)

Yields (page 0-index, named destination) of the whole document.

toc: list[pypdfium2._helpers.document.PdfOutlineItem]
76    @cached_property
77    def toc(self) -> list[pp.PdfOutlineItem]:
78        """
79        The table of content as a sorted list of outline items ensuring item has
80        a page index by reusing the last one.
81        """
82        tocs = set()
83        # Sometimes the TOC contains duplicates so we must use a set
84        last_page_index = 0
85        for toc in self.get_toc():
86            outline = _OutlineItem(
87                toc.level,
88                toc.title,
89                toc.is_closed,
90                toc.n_kids,
91                toc.page_index or last_page_index,
92                toc.view_mode,
93                toc.view_pos,
94            )
95            last_page_index = toc.page_index or last_page_index
96            tocs.add(outline)
97        return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title)))

The table of contents as a sorted list of outline items, ensuring every item has a page index by reusing the last one.

identifier_permanent: str
 99    @cached_property
100    def identifier_permanent(self) -> str:
101        """The permanent file identifier."""
102        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)

The permanent file identifier.

identifier_changing: str
104    @cached_property
105    def identifier_changing(self) -> str:
106        """The changing file identifier."""
107        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)

The changing file identifier.

page_count: int
109    @cached_property
110    def page_count(self) -> int:
111        """The number of pages in the document."""
112        return pp.raw.FPDF_GetPageCount(self)

The number of pages in the document.

@cache
def page(self, index: int) -> Page:
114    @cache
115    def page(self, index: int) -> Page:
116        """
117        :param index: 0-indexed page number.
118        :return: the page object for the index.
119        """
120        assert index < self.page_count
121        return Page(self, index)
Parameters
  • index: 0-indexed page number.
Returns

the page object for the index.

def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
123    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
124        """
125        :param numbers: an iterable range of page numbers (0-indexed!).
126                        If `None`, then the whole page range is used.
127        :return: yields each page in the range.
128        """
129        if numbers is None:
130            numbers = range(self.page_count)
131        for ii in numbers:
132            if 0 <= ii < self.page_count:
133                yield self.page(ii)
Parameters
  • numbers: an iterable range of page numbers (0-indexed!). If None, then the whole page range is used.
Returns

yields each page in the range.

Inherited Members
pypdfium2._helpers.document.PdfDocument
formenv
parent
new
init_forms
get_formtype
get_pagemode
is_tagged
save
get_identifier
get_version
get_metadata_value
METADATA_KEYS
get_metadata_dict
count_attachments
get_attachment
new_attachment
del_attachment
get_page
new_page
del_page
import_pages
get_page_size
get_page_label
page_as_xobject
get_toc
render
pypdfium2.internal.bases.AutoCloseable
close
class Page(pypdfium2._helpers.page.PdfPage):
 30class Page(pp.PdfPage):
 31    """
 32    This class provides low-level access to graphics and characters of the page.
 33    It also fixes missing bounding boxes for rotates characters on page load,
 34    as well as allow searching for characters in an area instead of just text.
 35    """
 36
 37    def __init__(self, document: "modm_data.pdf.Document", index: int):  # noqa: F821
 38        """
 39        :param document: a PDF document.
 40        :param index: 0-index page number.
 41        """
 42        self.index = index
 43        """0-index page number."""
 44        self.number = index + 1
 45        """1-index page number."""
 46
 47        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
 48        self._links = None
 49        self._weblinks = None
 50        self._linked = False
 51
 52        _LOGGER.debug(f"Loading: {index}")
 53
 54        self._text = self.get_textpage()
 55        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
 56        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
 57        # close them in reverse order
 58        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
 59        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)
 60
 61        self._fix_bboxes()
 62
 63    @cached_property
 64    def label(self) -> str:
 65        """The page label."""
 66        return self.pdf.get_page_label(self.index)
 67
 68    @cached_property
 69    def width(self) -> float:
 70        """The page width."""
 71        return self.get_width()
 72
 73    @cached_property
 74    def height(self) -> float:
 75        """The page height."""
 76        return self.get_height()
 77
 78    @cached_property
 79    def rotation(self) -> int:
 80        """The page rotation in degrees."""
 81        return self.get_rotation()
 82
 83    @cached_property
 84    def bbox(self) -> Rectangle:
 85        """The page bounding box."""
 86        return Rectangle(*self.get_bbox())
 87
 88    @cached_property
 89    def char_count(self) -> int:
 90        """The total count of characters."""
 91        return self._text.count_chars()
 92
 93    @cache
 94    def char(self, index: int) -> Character:
 95        """:return: The character at the 0-index."""
 96        return Character(self, index)
 97
 98    @property
 99    def chars(self) -> Iterator[Character]:
100        """Yields all characters."""
101        for ii in range(self.char_count):
102            yield self.char(ii)
103
104    @cached_property
105    def objlinks(self) -> list[ObjLink]:
106        """All object links."""
107        links = []
108        pos = ctypes.c_int(0)
109        link = pp.raw.FPDF_LINK()
110        while pp.raw.FPDFLink_Enumerate(self, pos, link):
111            links.append(ObjLink(self, link))
112        return links
113
114    @cached_property
115    def weblinks(self) -> list[WebLink]:
116        """All web links."""
117        links = []
118        for ii in range(pp.raw.FPDFLink_CountWebLinks(self._linkpage)):
119            links.append(WebLink(self, ii))
120        return links
121
122    def chars_in_area(self, area: Rectangle) -> list[Character]:
123        """
124        :param area: area to search for character in.
125        :return: All characters found in the area.
126        """
127        found = []
128        # We perform binary searches of the lower and upper y-positions first
129        # lines are ordered by y-position
130        ypositions = list(self._charlines.keys())
131        y_bottom = bisect_left(ypositions, area.bottom)
132        y_top = bisect_right(ypositions, area.top, lo=y_bottom)
133
134        # Then for every line we do another binary search for left and right
135        for ypos in ypositions[y_bottom:y_top]:
136            chars = self._charlines[ypos]
137            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
138            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
139            # Finally we add all these characters
140            found.extend(chars[x_left:x_right])
141        return found
142
143    def text_in_area(self, area: Rectangle) -> str:
144        """
145        :param area: area to search for text in.
146        :return: Only the text found in the area.
147        """
148        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)
149
150    @property
151    def structures(self) -> Iterator[Structure]:
152        """The PDF/UA tags."""
153        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
154        for ii in range(count):
155            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
156            yield Structure(self, child)
157
158    def find(self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
159        """
160        Searches for a match string as whole, consecutive words and yields the
161        characters.
162
163        :param string: The search string.
164        :param case_sensitive: Ignore case if false.
165        :return: yields the characters found.
166        """
167        searcher = self._text.search(string, match_case=case_sensitive, match_whole_word=True, consecutive=True)
168        while idx := searcher.get_next():
169            chars = [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]
170            yield chars
171
172    @cached_property
173    def paths(self) -> list[Path]:
174        """All paths."""
175        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]
176
177    @cached_property
178    def images(self) -> list[Image]:
179        """All images."""
180        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]
181
182    def graphic_clusters(
183        self, predicate: Callable[[Path | Image], bool] = None, absolute_tolerance: float = None
184    ) -> list[tuple[Rectangle, list[Path]]]:
185        if absolute_tolerance is None:
186            absolute_tolerance = min(self.width, self.height) * 0.01
187
188        # First collect all vertical regions
189        filtered_paths = []
190        for path in self.paths:
191            if predicate is None or predicate(path):
192                filtered_paths.append(path)
193        for image in self.images:
194            if predicate is None or predicate(image):
195                filtered_paths.append(image)
196
197        regions = []
198        for path in sorted(filtered_paths, key=lambda path: path.bbox.y):
199            for reg in regions:
200                if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
201                    # They overlap, so merge them
202                    reg.v0 = min(reg.v0, path.bbox.bottom)
203                    reg.v1 = max(reg.v1, path.bbox.top)
204                    reg.objs.append(path)
205                    break
206            else:
207                regions.append(Region(path.bbox.bottom, path.bbox.top, path))
208
209        # Now collect horizontal region inside each vertical region
210        for yreg in regions:
211            for path in sorted(filtered_paths, key=lambda path: path.bbox.x):
212                # check if horizontal line is contained in vregion
213                if yreg.contains(path.bbox.y, absolute_tolerance):
214                    for xreg in yreg.subregions:
215                        if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
216                            # They overlap so merge them
217                            xreg.v0 = min(xreg.v0, path.bbox.left)
218                            xreg.v1 = max(xreg.v1, path.bbox.right)
219                            xreg.objs.append(path)
220                            break
221                    else:
222                        yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))
223
224        clusters = []
225        for yreg in regions:
226            for xreg in yreg.subregions:
227                if len(yreg.subregions) > 1:
228                    # Strip down the height again for subregions
229                    y0, y1 = 1e9, 0
230                    for path in xreg.objs:
231                        y0 = min(y0, path.bbox.bottom)
232                        y1 = max(y1, path.bbox.top)
233                else:
234                    y0, y1 = yreg.v0, yreg.v1
235                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
236                clusters.append((bbox, xreg.objs))
237
238        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
239
240    def _link_characters(self):
241        if self._linked:
242            return
243        # The in-document links only gives us rectangles and we must find the
244        # linked chars ourselves
245        for link in self.objlinks:
246            for char in self.chars_in_area(link.bbox):
247                char.objlink = link
248        # The weblinks give you an explicit char range, very convenient
249        for link in self.weblinks:
250            for ii in range(*link.range):
251                self.char(ii).weblink = link
252        self._linked = True
253
254    @cached_property
255    def _charlines(self):
256        charlines = defaultdict(list)
257        for char in self.chars:
258            charlines[round(char.bbox.midpoint.y, 1)].append(char)
259
260        orderedchars = OrderedDict.fromkeys(sorted(charlines))
261        for ypos, chars in charlines.items():
262            orderedchars[ypos] = sorted(chars, key=lambda c: c.bbox.midpoint.x)
263
264        return orderedchars
265
266    def _fix_bboxes(self):
267        def _key(char):
268            height = round(char.tbbox.height, 1)
269            width = round(char.tbbox.width, 1)
270            return f"{char.font} {char.unicode} {height} {width}"
271
272        fix_chars = []
273        for char in self.chars:
274            if not char._bbox.width or not char._bbox.height:
275                if char._rotation:
276                    fix_chars.append(char)
277                elif char.unicode not in {0xA, 0xD}:
278                    fix_chars.append(char)
279            elif char.unicode not in {0xA, 0xD} and not char._rotation and _key(char) not in self.pdf._bbox_cache:
280                bbox = char._bbox.translated(-char.origin).rotated(self.rotation + char._rotation)
281                self.pdf._bbox_cache[_key(char)] = (char, bbox)
282                # print("->", _key(char), char.descr(), char.height, char.rotation, char._rotation, self.rotation)
283        for char in fix_chars:
284            bbox = self.pdf._bbox_cache.get(_key(char))
285            if bbox is not None:
286                # print("<-", char.descr(), char._rotation, char.rotation, char.height)
287                _, bbox = bbox
288                bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
289                char._bbox = bbox
290            elif char.unicode not in {0x20, 0xA, 0xD}:
291                _LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")

This class provides low-level access to graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.

Page(document: Document, index: int)
37    def __init__(self, document: "modm_data.pdf.Document", index: int):  # noqa: F821
38        """
39        :param document: a PDF document.
40        :param index: 0-index page number.
41        """
42        self.index = index
43        """0-index page number."""
44        self.number = index + 1
45        """1-index page number."""
46
47        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
48        self._links = None
49        self._weblinks = None
50        self._linked = False
51
52        _LOGGER.debug(f"Loading: {index}")
53
54        self._text = self.get_textpage()
55        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
56        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
57        # close them in reverse order
58        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
59        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)
60
61        self._fix_bboxes()
Parameters
  • document: a PDF document.
  • index: 0-index page number.
index

0-index page number.

number

1-index page number.

label: str
63    @cached_property
64    def label(self) -> str:
65        """The page label."""
66        return self.pdf.get_page_label(self.index)

The page label.

width: float
68    @cached_property
69    def width(self) -> float:
70        """The page width."""
71        return self.get_width()

The page width.

height: float
73    @cached_property
74    def height(self) -> float:
75        """The page height."""
76        return self.get_height()

The page height.

rotation: int
78    @cached_property
79    def rotation(self) -> int:
80        """The page rotation in degrees."""
81        return self.get_rotation()

The page rotation in degrees.

bbox: modm_data.utils.Rectangle
83    @cached_property
84    def bbox(self) -> Rectangle:
85        """The page bounding box."""
86        return Rectangle(*self.get_bbox())

The page bounding box.

char_count: int
88    @cached_property
89    def char_count(self) -> int:
90        """The total count of characters."""
91        return self._text.count_chars()

The total count of characters.

@cache
def char(self, index: int) -> Character:
93    @cache
94    def char(self, index: int) -> Character:
95        """:return: The character at the 0-index."""
96        return Character(self, index)
Returns

The character at the 0-index.

chars: Iterator[Character]
 98    @property
 99    def chars(self) -> Iterator[Character]:
100        """Yields all characters."""
101        for ii in range(self.char_count):
102            yield self.char(ii)

Yields all characters.

def chars_in_area( self, area: modm_data.utils.Rectangle) -> list[Character]:
122    def chars_in_area(self, area: Rectangle) -> list[Character]:
123        """
124        :param area: area to search for character in.
125        :return: All characters found in the area.
126        """
127        found = []
128        # We perform binary searches of the lower and upper y-positions first
129        # lines are ordered by y-position
130        ypositions = list(self._charlines.keys())
131        y_bottom = bisect_left(ypositions, area.bottom)
132        y_top = bisect_right(ypositions, area.top, lo=y_bottom)
133
134        # Then for every line we do another binary search for left and right
135        for ypos in ypositions[y_bottom:y_top]:
136            chars = self._charlines[ypos]
137            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
138            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
139            # Finally we add all these characters
140            found.extend(chars[x_left:x_right])
141        return found
Parameters
  • area: area to search for character in.
Returns

All characters found in the area.

def text_in_area(self, area: modm_data.utils.Rectangle) -> str:
143    def text_in_area(self, area: Rectangle) -> str:
144        """
145        :param area: area to search for text in.
146        :return: Only the text found in the area.
147        """
148        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)
Parameters
  • area: area to search for text in.
Returns

Only the text found in the area.

structures: Iterator[Structure]
150    @property
151    def structures(self) -> Iterator[Structure]:
152        """The PDF/UA tags."""
153        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
154        for ii in range(count):
155            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
156            yield Structure(self, child)

The PDF/UA tags.

def find( self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
158    def find(self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
159        """
160        Searches for a match string as whole, consecutive words and yields the
161        characters.
162
163        :param string: The search string.
164        :param case_sensitive: Ignore case if false.
165        :return: yields the characters found.
166        """
167        searcher = self._text.search(string, match_case=case_sensitive, match_whole_word=True, consecutive=True)
168        while idx := searcher.get_next():
169            chars = [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]
170            yield chars

Searches for a match string as whole, consecutive words and yields the characters.

Parameters
  • string: The search string.
  • case_sensitive: Ignore case if false.
Returns

yields the characters found.

paths: list[Path]
172    @cached_property
173    def paths(self) -> list[Path]:
174        """All paths."""
175        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]

All paths.

images: list[Image]
177    @cached_property
178    def images(self) -> list[Image]:
179        """All images."""
180        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]

All images.

def graphic_clusters( self, predicate: Callable[[Path | Image], bool] = None, absolute_tolerance: float = None) -> list[tuple[modm_data.utils.Rectangle, list[Path]]]:
182    def graphic_clusters(
183        self, predicate: Callable[[Path | Image], bool] = None, absolute_tolerance: float = None
184    ) -> list[tuple[Rectangle, list[Path]]]:
185        if absolute_tolerance is None:
186            absolute_tolerance = min(self.width, self.height) * 0.01
187
188        # First collect all vertical regions
189        filtered_paths = []
190        for path in self.paths:
191            if predicate is None or predicate(path):
192                filtered_paths.append(path)
193        for image in self.images:
194            if predicate is None or predicate(image):
195                filtered_paths.append(image)
196
197        regions = []
198        for path in sorted(filtered_paths, key=lambda path: path.bbox.y):
199            for reg in regions:
200                if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
201                    # They overlap, so merge them
202                    reg.v0 = min(reg.v0, path.bbox.bottom)
203                    reg.v1 = max(reg.v1, path.bbox.top)
204                    reg.objs.append(path)
205                    break
206            else:
207                regions.append(Region(path.bbox.bottom, path.bbox.top, path))
208
209        # Now collect horizontal region inside each vertical region
210        for yreg in regions:
211            for path in sorted(filtered_paths, key=lambda path: path.bbox.x):
212                # check if horizontal line is contained in vregion
213                if yreg.contains(path.bbox.y, absolute_tolerance):
214                    for xreg in yreg.subregions:
215                        if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
216                            # They overlap so merge them
217                            xreg.v0 = min(xreg.v0, path.bbox.left)
218                            xreg.v1 = max(xreg.v1, path.bbox.right)
219                            xreg.objs.append(path)
220                            break
221                    else:
222                        yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))
223
224        clusters = []
225        for yreg in regions:
226            for xreg in yreg.subregions:
227                if len(yreg.subregions) > 1:
228                    # Strip down the height again for subregions
229                    y0, y1 = 1e9, 0
230                    for path in xreg.objs:
231                        y0 = min(y0, path.bbox.bottom)
232                        y1 = max(y1, path.bbox.top)
233                else:
234                    y0, y1 = yreg.v0, yreg.v1
235                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
236                clusters.append((bbox, xreg.objs))
237
238        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
Inherited Members
pypdfium2._helpers.page.PdfPage
parent
get_width
get_height
get_size
get_rotation
set_rotation
get_mediabox
set_mediabox
get_cropbox
set_cropbox
get_bleedbox
set_bleedbox
get_trimbox
set_trimbox
get_artbox
set_artbox
get_bbox
get_textpage
insert_obj
remove_obj
gen_content
get_objects
render
pypdfium2.internal.bases.AutoCloseable
close
class Character:
 28class Character:
 29    """
 30    This class contains all information about a single character in the PDF
 31    page.
 32    """
 33
 34    class RenderMode(Enum):
 35        """Tells the PDF viewer how to render this character glyph."""
 36
 37        UNKNOWN = -1
 38        FILL = 0
 39        STROKE = 1
 40        FILL_STROKE = 2
 41        INVISIBLE = 3
 42        FILL_CLIP = 4
 43        STROKE_CLIP = 5
 44        FILL_STROKE_CLIP = 6
 45        CLIP = 7
 46
 47    def __init__(self, page: "modm_data.pdf.page.Page", index: int):  # noqa: F821
 48        """
 49        :param page: The page containing the character.
 50        :param index: The index of the character.
 51        """
 52        self._page = page
 53        self._text = page._text
 54        self._index = index
 55        self._font = None
 56        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))
 57
 58        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
 59        """The unicode value of the character."""
 60        self.objlink: "modm_data.pdf.link.ObjLink" = None  # noqa: F821
 61        """The object link of this character or `None`"""
 62        self.weblink: "modm_data.pdf.link.WebLink" = None  # noqa: F821
 63        """The web link of this character or `None`"""
 64
 65        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
 66        if self._page.rotation:
 67            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x, bbox.p1.y, self._page.height - bbox.p0.x)
 68        self._bbox = bbox
 69
 70    def _font_flags(self) -> tuple[str, int]:
 71        if self._font is None:
 72            font = ctypes.create_string_buffer(255)
 73            flags = ctypes.c_int()
 74            pp.raw.FPDFText_GetFontInfo(self._text, self._index, font, 255, flags)
 75            self._font = (font.value.decode("utf-8"), flags.value)
 76        return self._font
 77
 78    @property
 79    def char(self) -> str:
 80        """The printable string of the unicode value."""
 81        char = chr(self.unicode)
 82        return char if char.isprintable() else ""
 83
 84    @cached_property
 85    def origin(self) -> Point:
 86        """The origin of the character."""
 87        x, y = ctypes.c_double(), ctypes.c_double()
 88        assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y)
 89        if self._page.rotation:
 90            return Point(y.value, self._page.height - x.value)
 91        return Point(x.value, y.value)
 92
 93    @cached_property
 94    def width(self) -> float:
 95        """The width of the character's bounding box."""
 96        if self.rotation:
 97            return self.bbox.height
 98        return self.bbox.width
 99
100    @cached_property
101    def height(self) -> float:
102        """The height of the character's bounding box."""
103        if self.rotation:
104            return self.bbox.width
105        return self.bbox.height
106
107    @cached_property
108    def tbbox(self) -> Rectangle:
109        """The tight bounding box of the character."""
110        tbbox = Rectangle(*self._text.get_charbox(self._index))
111        if self._page.rotation:
112            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x, tbbox.p1.y, self._page.height - tbbox.p0.x)
113        return tbbox
114
115    @property
116    def bbox(self) -> Rectangle:
117        """
118        The loose bounding box of the character.
119        .. note::
120            If the loose bounding box is not available, the tight bounding box
121            is used instead.
122        """
123        if not self._bbox.width or not self._bbox.height:
124            return self.tbbox
125        return self._bbox
126
127    @cached_property
128    def twidth(self) -> float:
129        """The width of the character's tight bounding box."""
130        return self.tbbox.width
131
132    @cached_property
133    def theight(self) -> float:
134        """The height of the character's tight bounding box."""
135        return self.tbbox.height
136
137    @cached_property
138    def render_mode(self) -> RenderMode:
139        """The render mode of the character."""
140        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))
141
142    @cached_property
143    def rotation(self) -> int:
144        """The rotation of the character in degrees modulo 360."""
145        # Special case for vertical text in rotated pages
146        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xA, 0xD}:
147            return 90
148        if self._page.rotation and self._rotation:
149            return (self._page.rotation + self._rotation) % 360
150        return self._rotation
151
152    @cached_property
153    def size(self) -> float:
154        """The font size of the character."""
155        return pp.raw.FPDFText_GetFontSize(self._text, self._index)
156
157    @cached_property
158    def weight(self) -> int:
159        """The font weight of the character."""
160        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)
161
162    @cached_property
163    def fill(self) -> int:
164        """The fill color of the character."""
165        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
166        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
167        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
168
169    @cached_property
170    def stroke(self) -> int:
171        """The stroke color of the character."""
172        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
173        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
174        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
175
176    @cached_property
177    def font(self) -> str:
178        """The font name of the character."""
179        return self._font_flags()[0]
180
181    @cached_property
182    def flags(self) -> int:
183        """The font flags of the character."""
184        return self._font_flags()[1]
185
186    def descr(self) -> str:
187        """Human-readable description of the character for debugging."""
188        char = chr(self.unicode)
189        if not char.isprintable():
190            char = hex(self.unicode)
191        return (
192            f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, "
193            f"{self.render_mode}, {self.font}, {hex(self.flags)}, "
194            f"{self.fill}, {self.stroke}, {repr(self.bbox)})"
195        )
196
197    def __str__(self) -> str:
198        return self.char
199
200    def __repr__(self) -> str:
201        char = chr(self.unicode)
202        escape = {0xA: "\\n", 0xD: "\\r", 0x9: "\\t", 0x20: "␣"}
203        char = escape.get(self.unicode, char if char.isprintable() else hex(self.unicode))
204        return char

This class contains all information about a single character in the PDF page.

Character(page: Page, index: int)
47    def __init__(self, page: "modm_data.pdf.page.Page", index: int):  # noqa: F821
48        """
49        :param page: The page containing the character.
50        :param index: The index of the character.
51        """
52        self._page = page
53        self._text = page._text
54        self._index = index
55        self._font = None
56        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))
57
58        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
59        """The unicode value of the character."""
60        self.objlink: "modm_data.pdf.link.ObjLink" = None  # noqa: F821
61        """The object link of this character or `None`"""
62        self.weblink: "modm_data.pdf.link.WebLink" = None  # noqa: F821
63        """The web link of this character or `None`"""
64
65        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
66        if self._page.rotation:
67            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x, bbox.p1.y, self._page.height - bbox.p0.x)
68        self._bbox = bbox
Parameters
  • page: The page containing the character.
  • index: The index of the character.
unicode: int

The unicode value of the character.

char: str
78    @property
79    def char(self) -> str:
80        """The printable string of the unicode value."""
81        char = chr(self.unicode)
82        return char if char.isprintable() else ""

The printable string of the unicode value.

origin: modm_data.utils.Point
84    @cached_property
85    def origin(self) -> Point:
86        """The origin of the character."""
87        x, y = ctypes.c_double(), ctypes.c_double()
88        assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y)
89        if self._page.rotation:
90            return Point(y.value, self._page.height - x.value)
91        return Point(x.value, y.value)

The origin of the character.

width: float
93    @cached_property
94    def width(self) -> float:
95        """The width of the character's bounding box."""
96        if self.rotation:
97            return self.bbox.height
98        return self.bbox.width

The width of the character's bounding box.

height: float
100    @cached_property
101    def height(self) -> float:
102        """The height of the character's bounding box."""
103        if self.rotation:
104            return self.bbox.width
105        return self.bbox.height

The height of the character's bounding box.

tbbox: modm_data.utils.Rectangle
107    @cached_property
108    def tbbox(self) -> Rectangle:
109        """The tight bounding box of the character."""
110        tbbox = Rectangle(*self._text.get_charbox(self._index))
111        if self._page.rotation:
112            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x, tbbox.p1.y, self._page.height - tbbox.p0.x)
113        return tbbox

The tight bounding box of the character.

bbox: modm_data.utils.Rectangle
115    @property
116    def bbox(self) -> Rectangle:
117        """
118        The loose bounding box of the character.
119        .. note::
120            If the loose bounding box is not available, the tight bounding box
121            is used instead.
122        """
123        if not self._bbox.width or not self._bbox.height:
124            return self.tbbox
125        return self._bbox

The loose bounding box of the character.

If the loose bounding box is not available, the tight bounding box is used instead.

twidth: float
127    @cached_property
128    def twidth(self) -> float:
129        """The width of the character's tight bounding box."""
130        return self.tbbox.width

The width of the character's tight bounding box.

theight: float
132    @cached_property
133    def theight(self) -> float:
134        """The height of the character's tight bounding box."""
135        return self.tbbox.height

The height of the character's tight bounding box.

render_mode: Character.RenderMode
137    @cached_property
138    def render_mode(self) -> RenderMode:
139        """The render mode of the character."""
140        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))

The render mode of the character.

rotation: int
142    @cached_property
143    def rotation(self) -> int:
144        """The rotation of the character in degrees modulo 360."""
145        # Special case for vertical text in rotated pages
146        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xA, 0xD}:
147            return 90
148        if self._page.rotation and self._rotation:
149            return (self._page.rotation + self._rotation) % 360
150        return self._rotation

The rotation of the character in degrees modulo 360.

size: float
152    @cached_property
153    def size(self) -> float:
154        """The font size of the character."""
155        return pp.raw.FPDFText_GetFontSize(self._text, self._index)

The font size of the character.

weight: int
157    @cached_property
158    def weight(self) -> int:
159        """The font weight of the character."""
160        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)

The font weight of the character.

fill: int
162    @cached_property
163    def fill(self) -> int:
164        """The fill color of the character."""
165        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
166        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
167        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The fill color of the character.

stroke: int
169    @cached_property
170    def stroke(self) -> int:
171        """The stroke color of the character."""
172        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
173        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
174        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The stroke color of the character.

font: str
176    @cached_property
177    def font(self) -> str:
178        """The font name of the character."""
179        return self._font_flags()[0]

The font name of the character.

flags: int
181    @cached_property
182    def flags(self) -> int:
183        """The font flags of the character."""
184        return self._font_flags()[1]

The font flags of the character.

def descr(self) -> str:
186    def descr(self) -> str:
187        """Human-readable description of the character for debugging."""
188        char = chr(self.unicode)
189        if not char.isprintable():
190            char = hex(self.unicode)
191        return (
192            f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, "
193            f"{self.render_mode}, {self.font}, {hex(self.flags)}, "
194            f"{self.fill}, {self.stroke}, {repr(self.bbox)})"
195        )

Human-readable description of the character for debugging.

class Character.RenderMode(enum.Enum):
34    class RenderMode(Enum):
35        """Tells the PDF viewer how to render this character glyph."""
36
37        UNKNOWN = -1
38        FILL = 0
39        STROKE = 1
40        FILL_STROKE = 2
41        INVISIBLE = 3
42        FILL_CLIP = 4
43        STROKE_CLIP = 5
44        FILL_STROKE_CLIP = 6
45        CLIP = 7

Tells the PDF viewer how to render this character glyph.

UNKNOWN = <RenderMode.UNKNOWN: -1>
FILL = <RenderMode.FILL: 0>
STROKE = <RenderMode.STROKE: 1>
FILL_STROKE = <RenderMode.FILL_STROKE: 2>
INVISIBLE = <RenderMode.INVISIBLE: 3>
FILL_CLIP = <RenderMode.FILL_CLIP: 4>
STROKE_CLIP = <RenderMode.STROKE_CLIP: 5>
FILL_STROKE_CLIP = <RenderMode.FILL_STROKE_CLIP: 6>
CLIP = <RenderMode.CLIP: 7>
Inherited Members
enum.Enum
name
value
class Path(pypdfium2._helpers.pageobjects.PdfObject):
 20class Path(pp.PdfObject):
 21    """
 22    This class specializes `pypdfium2.PdfObject` to add accessors for  graphics
 23    containing vector paths of various configurations.
 24
 25    You must construct the paths by calling `modm_data.pdf.page.Page.paths`.
 26    """
 27
 28    class Type(Enum):
 29        """Path Type"""
 30
 31        LINE = 0
 32        BEZIER = 1
 33        MOVE = 2
 34
 35    class Cap(Enum):
 36        """Path Cap Type"""
 37
 38        BUTT = 0
 39        ROUND = 1
 40        PROJECTING_SQUARE = 2
 41
 42    class Join(Enum):
 43        """Path Join Type"""
 44
 45        MITER = 0
 46        ROUND = 1
 47        BEVEL = 2
 48
 49    # Overwrite the PdfPageObject.__new__ function
 50    def __new__(cls, *args, **kwargs):
 51        return object.__new__(cls)
 52
 53    def __init__(self, obj):
 54        """
 55        :param obj: PDF object of the path.
 56        """
 57        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
 58        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_PATH
 59        self.type = pp.raw.FPDF_PAGEOBJ_PATH
 60
 61    @cached_property
 62    def matrix(self) -> pp.PdfMatrix:
 63        """The transformation matrix."""
 64        return self.get_matrix()
 65
 66    @cached_property
 67    def count(self) -> int:
 68        """Number of segments in this path."""
 69        return pp.raw.FPDFPath_CountSegments(self)
 70
 71    @cached_property
 72    def fill(self) -> int:
 73        """The fill color encoded as 32-bit RGBA."""
 74        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
 75        assert pp.raw.FPDFPageObj_GetFillColor(self, r, g, b, a)
 76        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
 77
 78    @cached_property
 79    def stroke(self) -> int:
 80        """The stroke color encoded as 32-bit RGBA."""
 81        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
 82        assert pp.raw.FPDFPageObj_GetStrokeColor(self, r, g, b, a)
 83        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
 84
 85    @cached_property
 86    def width(self) -> float:
 87        """The stroke width."""
 88        width = ctypes.c_float()
 89        assert pp.raw.FPDFPageObj_GetStrokeWidth(self, width)
 90        return width.value
 91
 92    @cached_property
 93    def cap(self) -> Cap:
 94        """Line cap type."""
 95        return Path.Cap(pp.raw.FPDFPageObj_GetLineCap(self))
 96
 97    @cached_property
 98    def join(self) -> Join:
 99        """Line join type."""
100        return Path.Join(pp.raw.FPDFPageObj_GetLineJoin(self))
101
102    @cached_property
103    def bbox(self) -> Rectangle:
104        """
105        Bounding box of the path.
106        .. warning::
107            The bounding is only approximated using the control points!
108            Therefore bezier curves will likely have a larger bounding box.
109        """
110        left, bottom = ctypes.c_float(), ctypes.c_float()
111        right, top = ctypes.c_float(), ctypes.c_float()
112        assert pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top)
113        bbox = Rectangle(left.value, bottom.value, right.value, top.value)
114        if self.page.rotation:
115            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
116        return bbox
117
118    @cached_property
119    def points(self) -> list[Point]:
120        """
121        List of points of the path. If the path is closed, the first point is
122        added to the end of the list.
123        """
124        points = []
125        for ii in range(self.count):
126            seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
127            ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
128            # The first point should always be MOVETO
129            assert ii or ptype == Path.Type.MOVE
130
131            x, y = ctypes.c_float(), ctypes.c_float()
132            assert pp.raw.FPDFPathSegment_GetPoint(seg, x, y)
133            x, y = self.matrix.on_point(x.value, y.value)
134            points.append(Point(x, y, type=ptype))
135
136            if pp.raw.FPDFPathSegment_GetClose(seg):
137                points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))
138
139        if self.page.rotation:
140            points = [Point(y, self.page.height - x, type=p.type) for p in points]
141        return points
142
143    @cached_property
144    def lines(self) -> list[Line]:
145        """List of lines between the path points."""
146        points = self.points
147        return [
148            Line(points[ii], points[ii + 1], width=self.width, type=points[ii + 1].type)
149            for ii in range(len(points) - 1)
150        ]
151
152    def __repr__(self) -> str:
153        points = ",".join(repr(p) for p in self.points)
154        return f"P{self.count}={points}"

This class specializes pypdfium2.PdfObject to add accessors for graphics containing vector paths of various configurations.

You must construct the paths by calling modm_data.pdf.page.Page.paths.

Path(obj)
53    def __init__(self, obj):
54        """
55        :param obj: PDF object of the path.
56        """
57        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
58        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_PATH
59        self.type = pp.raw.FPDF_PAGEOBJ_PATH
Parameters
  • obj: PDF object of the path.
type
matrix: pypdfium2._helpers.matrix.PdfMatrix
61    @cached_property
62    def matrix(self) -> pp.PdfMatrix:
63        """The transformation matrix."""
64        return self.get_matrix()

The transformation matrix.

count: int
66    @cached_property
67    def count(self) -> int:
68        """Number of segments in this path."""
69        return pp.raw.FPDFPath_CountSegments(self)

Number of segments in this path.

fill: int
71    @cached_property
72    def fill(self) -> int:
73        """The fill color encoded as 32-bit RGBA."""
74        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
75        assert pp.raw.FPDFPageObj_GetFillColor(self, r, g, b, a)
76        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The fill color encoded as 32-bit RGBA.

stroke: int
78    @cached_property
79    def stroke(self) -> int:
80        """The stroke color encoded as 32-bit RGBA."""
81        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
82        assert pp.raw.FPDFPageObj_GetStrokeColor(self, r, g, b, a)
83        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The stroke color encoded as 32-bit RGBA.

width: float
85    @cached_property
86    def width(self) -> float:
87        """The stroke width."""
88        width = ctypes.c_float()
89        assert pp.raw.FPDFPageObj_GetStrokeWidth(self, width)
90        return width.value

The stroke width.

cap: Path.Cap
92    @cached_property
93    def cap(self) -> Cap:
94        """Line cap type."""
95        return Path.Cap(pp.raw.FPDFPageObj_GetLineCap(self))

Line cap type.

join: Path.Join
 97    @cached_property
 98    def join(self) -> Join:
 99        """Line join type."""
100        return Path.Join(pp.raw.FPDFPageObj_GetLineJoin(self))

Line join type.

bbox: modm_data.utils.Rectangle
102    @cached_property
103    def bbox(self) -> Rectangle:
104        """
105        Bounding box of the path.
106        .. warning::
107            The bounding is only approximated using the control points!
108            Therefore bezier curves will likely have a larger bounding box.
109        """
110        left, bottom = ctypes.c_float(), ctypes.c_float()
111        right, top = ctypes.c_float(), ctypes.c_float()
112        assert pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top)
113        bbox = Rectangle(left.value, bottom.value, right.value, top.value)
114        if self.page.rotation:
115            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
116        return bbox

Bounding box of the path.

The bounding box is only approximated using the control points! Therefore bezier curves will likely have a larger bounding box.

points: list[modm_data.utils.Point]
118    @cached_property
119    def points(self) -> list[Point]:
120        """
121        List of points of the path. If the path is closed, the first point is
122        added to the end of the list.
123        """
124        points = []
125        for ii in range(self.count):
126            seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
127            ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
128            # The first point should always be MOVETO
129            assert ii or ptype == Path.Type.MOVE
130
131            x, y = ctypes.c_float(), ctypes.c_float()
132            assert pp.raw.FPDFPathSegment_GetPoint(seg, x, y)
133            x, y = self.matrix.on_point(x.value, y.value)
134            points.append(Point(x, y, type=ptype))
135
136            if pp.raw.FPDFPathSegment_GetClose(seg):
137                points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))
138
139        if self.page.rotation:
140            points = [Point(y, self.page.height - x, type=p.type) for p in points]
141        return points

List of points of the path. If the path is closed, the first point is added to the end of the list.

lines: list[modm_data.utils.Line]
143    @cached_property
144    def lines(self) -> list[Line]:
145        """List of lines between the path points."""
146        points = self.points
147        return [
148            Line(points[ii], points[ii + 1], width=self.width, type=points[ii + 1].type)
149            for ii in range(len(points) - 1)
150        ]

List of lines between the path points.

Inherited Members
pypdfium2._helpers.pageobjects.PdfObject
parent
get_pos
get_matrix
set_matrix
transform
pypdfium2.internal.bases.AutoCloseable
close
class Path.Type(enum.Enum):
28    class Type(Enum):
29        """Path Type"""
30
31        LINE = 0
32        BEZIER = 1
33        MOVE = 2

Path Type

LINE = <Type.LINE: 0>
BEZIER = <Type.BEZIER: 1>
MOVE = <Type.MOVE: 2>
Inherited Members
enum.Enum
name
value
class Path.Cap(enum.Enum):
35    class Cap(Enum):
36        """Path Cap Type"""
37
38        BUTT = 0
39        ROUND = 1
40        PROJECTING_SQUARE = 2

Path Cap Type

BUTT = <Cap.BUTT: 0>
ROUND = <Cap.ROUND: 1>
PROJECTING_SQUARE = <Cap.PROJECTING_SQUARE: 2>
Inherited Members
enum.Enum
name
value
class Path.Join(enum.Enum):
42    class Join(Enum):
43        """Path Join Type"""
44
45        MITER = 0
46        ROUND = 1
47        BEVEL = 2

Path Join Type

MITER = <Join.MITER: 0>
ROUND = <Join.ROUND: 1>
BEVEL = <Join.BEVEL: 2>
Inherited Members
enum.Enum
name
value
class Image(pypdfium2._helpers.pageobjects.PdfImage):
class Image(pp.PdfImage):
    """
    This class extends `pypdfium2.PdfImage` to align it with the interface of
    the `Path` class so that it can be used in the same
    algorithms without filtering.

    You must construct the images by calling `modm_data.pdf.page.Page.images`.

    .. note:: Images are currently ignored.
    """

    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: Page object of the image.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE
        self.type = pp.raw.FPDF_PAGEOBJ_IMAGE

        self.count: int = 4
        """Number of segments. Always 4 due to rectangular image form.
           (For compatibility with `Path.count`.)"""
        self.stroke: int = 0
        """The border stroke color. Always 0.
           (For compatibility with `Path.stroke`.)"""
        self.fill: int = 0
        """The image fill color. Always 0.
           (For compatibility with `Path.fill`.)"""
        self.width: float = 0
        """The border line width. Always 0.
           (For compatibility with `Path.width`.)"""

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        return self.get_matrix()

    @cached_property
    def bbox(self) -> Rectangle:
        """The bounding box of the image."""
        bbox = Rectangle(*self.get_pos())
        if self.page.rotation:
            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
        return bbox

    @cached_property
    def points(self) -> list[Point]:
        """
        The 4 points of the bounding box.
        (For compatibility with `Path.points`.)
        """
        # `bbox` has already been transformed for rotated pages, so its corner
        # points must not be rotated a second time here.
        return self.bbox.points

    @cached_property
    def lines(self) -> list[Line]:
        """
        The 4 lines of the bounding box.
        (For compatibility with `Path.lines`.)
        """
        p = self.points
        # Each line carries the type of the point it ends on and has no width.
        # Keyword arguments are used to match how `Path.lines` builds its lines.
        return [
            Line(p[0], p[1], type=p[1].type, width=0),
            Line(p[1], p[2], type=p[2].type, width=0),
            Line(p[2], p[3], type=p[3].type, width=0),
            Line(p[3], p[0], type=p[0].type, width=0),
        ]

    def __repr__(self) -> str:
        return f"I{self.bbox}"

This class extends pypdfium2.PdfImage to align it with the interface of the Path class so that it can be used in the same algorithms without filtering.

You must construct the images by calling modm_data.pdf.page.Page.images.

Images are currently ignored.
Image(obj)
31    def __init__(self, obj):
32        """
33        :param obj: Page object of the image.
34        """
35        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
36        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE
37        self.type = pp.raw.FPDF_PAGEOBJ_IMAGE
38
39        self.count: int = 4
40        """Number of segments. Always 4 due to rectangular image form.
41           (For compatibility with `Path.count`.)"""
42        self.stroke: int = 0
43        """The border stroke color. Always 0.
44           (For compatibility with `Path.stroke`.)"""
45        self.fill: int = 0
46        """The image fill color. Always 0.
47           (For compatibility with `Path.fill`.)"""
48        self.width: float = 0
49        """The border line width. Always 0.
50           (For compatibility with `Path.width`.)"""
Parameters
  • obj: Page object of the image.
type
count: int

Number of segments. Always 4 due to rectangular image form. (For compatibility with Path.count.)

stroke: int

The border stroke color. Always 0. (For compatibility with Path.stroke.)

fill: int

The image fill color. Always 0. (For compatibility with Path.fill.)

width: float

The border line width. Always 0. (For compatibility with Path.width.)

matrix: pypdfium2._helpers.matrix.PdfMatrix
52    @cached_property
53    def matrix(self) -> pp.PdfMatrix:
54        """The transformation matrix."""
55        return self.get_matrix()

The transformation matrix.

bbox: modm_data.utils.Rectangle
57    @cached_property
58    def bbox(self) -> Rectangle:
59        """The bounding box of the image."""
60        bbox = Rectangle(*self.get_pos())
61        if self.page.rotation:
62            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
63        return bbox

The bounding box of the image.

points: list[modm_data.utils.Point]
65    @cached_property
66    def points(self) -> list[Point]:
67        """
68        The 4 points of the bounding box.
69        (For compatibility with `Path.points`.)
70        """
71        points = self.bbox.points
72        if self.page.rotation:
73            points = [Point(p.y, self.page.height - p.x, p.type) for p in points]
74        return points

The 4 points of the bounding box. (For compatibility with Path.points.)

lines: list[modm_data.utils.Line]
76    @cached_property
77    def lines(self) -> list[Line]:
78        """
79        The 4 lines of the bounding box.
80        (For compatibility with `Path.lines`.)
81        """
82        p = self.points
83        return [
84            Line(p[0], p[1], p[1].type, 0),
85            Line(p[1], p[2], p[2].type, 0),
86            Line(p[2], p[3], p[3].type, 0),
87            Line(p[3], p[0], p[0].type, 0),
88        ]

The 4 lines of the bounding box. (For compatibility with Path.lines.)

Inherited Members
pypdfium2._helpers.pageobjects.PdfImage
SIMPLE_FILTERS
new
get_metadata
get_size
load_jpeg
set_bitmap
get_bitmap
get_data
get_filters
extract
pypdfium2._helpers.pageobjects.PdfObject
parent
get_pos
get_matrix
set_matrix
transform
pypdfium2.internal.bases.AutoCloseable
close
class Structure:
 24class Structure:
 25    """
 26    A PDF/UA ("tagged PDF") contains the structure of content as a tree data
 27    structure with similar semantics to HTML.
 28
 29    This class is a convenience wrapper around [the pdfium structtree methods](
 30    https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h).
 31    """
 32
 33    def __init__(self, page: "modm_data.pdf.page.Page", element: pp.raw.FPDF_STRUCTELEMENT, parent: "Structure" = None):  # noqa: F821
 34        self._page = page
 35        self._element = element
 36        self.parent: Structure = weakref.ref(parent) if parent else None
 37        """The parent node."""
 38
 39    def _get_string(self, function) -> str:
 40        length = function(self._element, 0, 0)
 41        clength = ctypes.c_ulong(length)
 42        cbuffer = ctypes.create_string_buffer(length)
 43        function(self._element, cbuffer, clength)
 44        return bytes(cbuffer).decode("utf-16-le", errors="ignore")
 45
 46    @cached_property
 47    def title(self) -> str:
 48        """Title `/T`"""
 49        return self._get_string(pp.raw.FPDF_StructElement_GetTitle)
 50
 51    @cached_property
 52    def actual_text(self) -> str:
 53        """The actual text."""
 54        return self._get_string(pp.raw.FPDF_StructElement_GetActualText)
 55
 56    @cached_property
 57    def alt_text(self) -> str:
 58        """Alternate Text"""
 59        return self._get_string(pp.raw.FPDF_StructElement_GetAltText)
 60
 61    @cached_property
 62    def type(self) -> str:
 63        """Type `/S`"""
 64        return self._get_string(pp.raw.FPDF_StructElement_GetType)
 65
 66    @cached_property
 67    def obj_type(self) -> str:
 68        """Object Type `/Type`"""
 69        return self._get_string(pp.raw.FPDF_StructElement_GetObjType)
 70
 71    @cached_property
 72    def language(self) -> str:
 73        """The case-insensitive IETF BCP 47 language code."""
 74        return self._get_string(pp.raw.FPDF_StructElement_GetLang)
 75
 76    @cached_property
 77    def id(self) -> str:
 78        """Identifier"""
 79        return self._get_string(pp.raw.FPDF_StructElement_GetID)
 80
 81    @cached_property
 82    def marked_ids(self) -> list[int]:
 83        """List of marked content identifiers"""
 84        ids = []
 85        for index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)):
 86            if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1:
 87                ids.append(mcid)
 88        return ids
 89
 90    @cached_property
 91    def attributes(self) -> dict[str, str | bool | float]:
 92        """
 93        All attributes of this structure element as a dictionary.
 94
 95        .. note::
 96            Due to limitations of the pdfium API, attribute arrays cannot be
 97            extracted! The values are marked as `[?]` in the dictionary.
 98        """
 99        kv = {}
100        for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)):
101            attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex)
102            for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)):
103                # Get the name
104                clength = ctypes.c_ulong(0)
105                cname = ctypes.create_string_buffer(1)  # workaround to get length
106                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength)
107                cname = ctypes.create_string_buffer(clength.value)
108                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength)
109                name = cname.raw.decode("utf-8", errors="ignore")
110
111                # Get the type
112                atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname)
113                assert atype != pp.raw.FPDF_OBJECT_UNKNOWN
114
115                # Then get each type individually
116                match atype:
117                    case pp.raw.FPDF_OBJECT_BOOLEAN:
118                        cbool = ctypes.bool()
119                        assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool)
120                        kv[name] = cbool.value
121
122                    case pp.raw.FPDF_OBJECT_NUMBER:
123                        cfloat = ctypes.c_float()
124                        assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat)
125                        kv[name] = cfloat.value
126
127                    case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME:
128                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength)
129                        cattrname = ctypes.create_string_buffer(clength.value * 2)
130                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength)
131                        kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[: clength.value - 1]
132
133                    # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed?
134                    # case pp.raw.FPDF_OBJECT_ARRAY:
135                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength)
136                    #     cblob = ctypes.create_string_buffer(clength.value)
137                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength)
138                    #     kv[name] = cblob.raw
139
140                    case pp.raw.FPDF_OBJECT_ARRAY:
141                        kv[name] = "[?]"
142
143                    case _:
144                        kv[name] = f"[unknown={atype}?]"
145        return kv
146
147    @cache
148    def child(self, index: int) -> "Structure":
149        """
150        :param index: 0-index of child.
151        :return: Child structure.
152        """
153        index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
154        return Structure(self._page, index, self)
155
156    @property
157    def children(self) -> list:
158        """All child structures."""
159        count = pp.raw.FPDF_StructElement_CountChildren(self._element)
160        for ii in range(count):
161            yield self.child(ii)
162
163    def descr(self, indent=0) -> str:
164        """Description including all children via indentation."""
165        string = " " * indent + repr(self) + "\n"
166        for child in self.children:
167            string += child.descr(indent + 4)
168        return string
169
170    def __repr__(self) -> str:
171        values = []
172        if self.type:
173            values.append(f"type={self.type}")
174        if self.title:
175            values.append(f"title={self.title}")
176        if self.actual_text:
177            values.append(f"act_text={self.actual_text}")
178        if self.alt_text:
179            values.append(f"alt_text={self.alt_text}")
180        if self.id:
181            values.append(f"id={self.id}")
182        values += [f"mid={i}" for i in self.marked_ids]
183        values += [f"{k}={v}" for k, v in self.attributes.items()]
184        return f"S({','.join(map(str, values))})"

A PDF/UA ("tagged PDF") contains the structure of content as a tree data structure with similar semantics to HTML.

This class is a convenience wrapper around the pdfium structtree methods.

Structure( page: Page, element: pypdfium2_raw.bindings.LP_struct_fpdf_structelement_t__, parent: Structure = None)
    def __init__(self, page: "modm_data.pdf.page.Page", element: pp.raw.FPDF_STRUCTELEMENT, parent: "Structure" = None):  # noqa: F821
        """
        :param page: Page object stored on this node.
        :param element: Raw pdfium structure element handle wrapped by this node.
        :param parent: Parent structure, or `None` if there is none.
        """
        self._page = page
        self._element = element
        # Stored as a weak reference: call `self.parent()` to obtain the node.
        self.parent: Structure = weakref.ref(parent) if parent else None
        """The parent node."""
parent: Structure

The parent node.

title: str
46    @cached_property
47    def title(self) -> str:
48        """Title `/T`"""
49        return self._get_string(pp.raw.FPDF_StructElement_GetTitle)

Title /T

actual_text: str
51    @cached_property
52    def actual_text(self) -> str:
53        """The actual text."""
54        return self._get_string(pp.raw.FPDF_StructElement_GetActualText)

The actual text.

alt_text: str
56    @cached_property
57    def alt_text(self) -> str:
58        """Alternate Text"""
59        return self._get_string(pp.raw.FPDF_StructElement_GetAltText)

Alternate Text

type: str
61    @cached_property
62    def type(self) -> str:
63        """Type `/S`"""
64        return self._get_string(pp.raw.FPDF_StructElement_GetType)

Type /S

obj_type: str
66    @cached_property
67    def obj_type(self) -> str:
68        """Object Type `/Type`"""
69        return self._get_string(pp.raw.FPDF_StructElement_GetObjType)

Object Type /Type

language: str
71    @cached_property
72    def language(self) -> str:
73        """The case-insensitive IETF BCP 47 language code."""
74        return self._get_string(pp.raw.FPDF_StructElement_GetLang)

The case-insensitive IETF BCP 47 language code.

id: str
76    @cached_property
77    def id(self) -> str:
78        """Identifier"""
79        return self._get_string(pp.raw.FPDF_StructElement_GetID)

Identifier

marked_ids: list[int]
81    @cached_property
82    def marked_ids(self) -> list[int]:
83        """List of marked content identifiers"""
84        ids = []
85        for index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)):
86            if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1:
87                ids.append(mcid)
88        return ids

List of marked content identifiers

attributes: dict[str, str | bool | float]
 90    @cached_property
 91    def attributes(self) -> dict[str, str | bool | float]:
 92        """
 93        All attributes of this structure element as a dictionary.
 94
 95        .. note::
 96            Due to limitations of the pdfium API, attribute arrays cannot be
 97            extracted! The values are marked as `[?]` in the dictionary.
 98        """
 99        kv = {}
100        for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)):
101            attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex)
102            for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)):
103                # Get the name
104                clength = ctypes.c_ulong(0)
105                cname = ctypes.create_string_buffer(1)  # workaround to get length
106                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength)
107                cname = ctypes.create_string_buffer(clength.value)
108                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength)
109                name = cname.raw.decode("utf-8", errors="ignore")
110
111                # Get the type
112                atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname)
113                assert atype != pp.raw.FPDF_OBJECT_UNKNOWN
114
115                # Then get each type individually
116                match atype:
117                    case pp.raw.FPDF_OBJECT_BOOLEAN:
118                        cbool = ctypes.bool()
119                        assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool)
120                        kv[name] = cbool.value
121
122                    case pp.raw.FPDF_OBJECT_NUMBER:
123                        cfloat = ctypes.c_float()
124                        assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat)
125                        kv[name] = cfloat.value
126
127                    case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME:
128                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength)
129                        cattrname = ctypes.create_string_buffer(clength.value * 2)
130                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength)
131                        kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[: clength.value - 1]
132
133                    # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed?
134                    # case pp.raw.FPDF_OBJECT_ARRAY:
135                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength)
136                    #     cblob = ctypes.create_string_buffer(clength.value)
137                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength)
138                    #     kv[name] = cblob.raw
139
140                    case pp.raw.FPDF_OBJECT_ARRAY:
141                        kv[name] = "[?]"
142
143                    case _:
144                        kv[name] = f"[unknown={atype}?]"
145        return kv

All attributes of this structure element as a dictionary.

Due to limitations of the pdfium API, attribute arrays cannot be extracted! The values are marked as [?] in the dictionary.

@cache
def child(self, index: int) -> Structure:
147    @cache
148    def child(self, index: int) -> "Structure":
149        """
150        :param index: 0-index of child.
151        :return: Child structure.
152        """
153        index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
154        return Structure(self._page, index, self)
Parameters
  • index: 0-index of child.
Returns

Child structure.

children: list
156    @property
157    def children(self) -> list:
158        """All child structures."""
159        count = pp.raw.FPDF_StructElement_CountChildren(self._element)
160        for ii in range(count):
161            yield self.child(ii)

All child structures.

def descr(self, indent=0) -> str:
163    def descr(self, indent=0) -> str:
164        """Description including all children via indentation."""
165        string = " " * indent + repr(self) + "\n"
166        for child in self.children:
167            string += child.descr(indent + 4)
168        return string

Description including all children via indentation.

def render_page_pdf(doc, page, new_doc=None, index=0):
 51def render_page_pdf(doc, page, new_doc=None, index=0):
 52    _, height = page.width, page.height
 53
 54    if new_doc is None:
 55        new_doc = pp.raw.FPDF_CreateNewDocument()
 56    # copy page over to new doc
 57    assert pp.raw.FPDF_ImportPages(new_doc, doc, str(page.number).encode("ascii"), index)
 58    new_page = pp.raw.FPDF_LoadPage(new_doc, index)
 59    rotation = page.rotation
 60
 61    for path in page.paths:
 62        p0 = path.points[0]
 63        if rotation:
 64            obj = pp.raw.FPDFPageObj_CreateNewPath(height - p0.y, p0.x)
 65        else:
 66            obj = pp.raw.FPDFPageObj_CreateNewPath(p0.x, p0.y)
 67        assert pp.raw.FPDFPageObj_SetStrokeColor(obj, 0, 0, 0xFF, 0xC0)
 68        assert pp.raw.FPDFPageObj_SetStrokeWidth(obj, 0.25)
 69        assert pp.raw.FPDFPageObj_SetLineJoin(obj, pp.raw.FPDF_LINEJOIN_ROUND)
 70        assert pp.raw.FPDFPageObj_SetLineCap(obj, pp.raw.FPDF_LINECAP_ROUND)
 71        assert pp.raw.FPDFPath_SetDrawMode(obj, 0, True)
 72        for point in path.points[1:]:
 73            if point.type == path.Type.MOVE:
 74                if rotation:
 75                    assert pp.raw.FPDFPath_MoveTo(obj, height - point.y, point.x)
 76                else:
 77                    assert pp.raw.FPDFPath_MoveTo(obj, point.x, point.y)
 78            else:
 79                if rotation:
 80                    assert pp.raw.FPDFPath_LineTo(obj, height - point.y, point.x)
 81                else:
 82                    assert pp.raw.FPDFPath_LineTo(obj, point.x, point.y)
 83        pp.raw.FPDFPage_InsertObject(new_page, obj)
 84
 85    for bbox, _ in page.graphic_clusters():
 86        _rect(new_page, rotation, bbox, width=2, stroke=0x00FFFF)
 87
 88    for link in page.objlinks:
 89        _rect(new_page, rotation, link.bbox, width=0.75, stroke=0x9ACD32)
 90
 91    for link in page.weblinks:
 92        for bbox in link.bboxes:
 93            _rect(new_page, rotation, bbox, width=0.75, stroke=0x00FF00)
 94
 95    for char in page.chars:
 96        color = 0x0000FF
 97        if char.bbox.width:
 98            _rect(new_page, rotation, char.bbox, width=0.5, stroke=0xFF0000)
 99            _vline(
100                new_page,
101                rotation,
102                char.bbox.midpoint.x,
103                char.bbox.midpoint.y - 1,
104                char.bbox.midpoint.y + 1,
105                width=0.25,
106                stroke=0xFF0000,
107            )
108            _hline(
109                new_page,
110                rotation,
111                char.bbox.midpoint.y,
112                char.bbox.midpoint.x - 1,
113                char.bbox.midpoint.x + 1,
114                width=0.25,
115                stroke=0xFF0000,
116            )
117            color = 0x000000
118        _vline(new_page, rotation, char.origin.x, char.origin.y - 1, char.origin.y + 1, width=0.25, stroke=color)
119        _hline(new_page, rotation, char.origin.y, char.origin.x - 1, char.origin.x + 1, width=0.25, stroke=color)
120
121    assert pp.raw.FPDFPage_GenerateContent(new_page)
122    pp.raw.FPDF_ClosePage(new_page)
123    return new_doc