modm_data.pdf

PDF Content Accessors

This module extends the pypdfium2 Python API with low-level accessors for characters and graphics. Note that these modules support read-only access to PDFs, since a lot of caching is used to speed up commonly accessed properties.

This module only contains formatting-independent PDF access, which is then specialized in the vendor-specific modm_data.pdf2html modules.

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4"""
 5# PDF Content Accessors
 6
 7This module extends the pypdfium2 Python API with low-level accessors for
 8characters and graphics. Note that these modules support read-only access to
 9PDFs, since a lot of caching is used to speed up commonly accessed properties.
10
11This module only contains formatting independent PDF access which is then
12specialized in the vendor-specific `modm_data.pdf2html` modules.
13"""
14
15from .document import Document
16from .page import Page
17from .character import Character
18from .link import ObjLink, WebLink
19from .path import Path
20from .image import Image
21from .render import annotate_debug_info
22from .structure import Structure
23
24__all__ = [
25    "annotate_debug_info",
26    "Document",
27    "Page",
28    "Character",
29    "Path",
30    "Image",
31    "ObjLink",
32    "WebLink",
33    "Structure",
34]
def annotate_debug_info( page: Page, new_doc: pypdfium2._helpers.document.PdfDocument = None, index: int = 0) -> pypdfium2._helpers.document.PdfDocument:
 52def annotate_debug_info(page: Page, new_doc: pp.PdfDocument = None, index: int = 0) -> pp.PdfDocument:
 53    """
 54    Copies each page into a new or existing PDF document and overlays the internal information on top of the content.
 55    - Renders the bounding boxes in RED and origins in BLACK of all characters.
 56    - Renders the bounding boxes of web links in BLUE GREEN.
 57    - Renders the bounding boxes of object links in YELLOW GREEN.
 58    - Renders all graphics paths in BLUE.
 59    - Renders the bounding boxes of computed graphics clusters in CYAN.
 60
 61    :param page: The page to be annotated.
 62    :param new_doc: The PDF document to copy the page to. If not provided, a new document is created.
 63    :param index: The index of the page in the new document.
 64    :return: The new document with the annotated page added.
 65    """
 66    _, height = page.width, page.height
 67
 68    if new_doc is None:
 69        new_doc = pp.raw.FPDF_CreateNewDocument()
 70    # copy page over to new doc
 71    assert pp.raw.FPDF_ImportPages(new_doc, page.pdf, str(page.number).encode("ascii"), index)
 72    new_page = pp.raw.FPDF_LoadPage(new_doc, index)
 73    rotation = page.rotation
 74
 75    for path in page.paths:
 76        p0 = path.points[0]
 77        if rotation:
 78            obj = pp.raw.FPDFPageObj_CreateNewPath(height - p0.y, p0.x)
 79        else:
 80            obj = pp.raw.FPDFPageObj_CreateNewPath(p0.x, p0.y)
 81        assert pp.raw.FPDFPageObj_SetStrokeColor(obj, 0, 0, 0xFF, 0xC0)
 82        assert pp.raw.FPDFPageObj_SetStrokeWidth(obj, 0.25)
 83        assert pp.raw.FPDFPageObj_SetLineJoin(obj, pp.raw.FPDF_LINEJOIN_ROUND)
 84        assert pp.raw.FPDFPageObj_SetLineCap(obj, pp.raw.FPDF_LINECAP_ROUND)
 85        assert pp.raw.FPDFPath_SetDrawMode(obj, 0, True)
 86        for point in path.points[1:]:
 87            if point.type == path.Type.MOVE:
 88                if rotation:
 89                    assert pp.raw.FPDFPath_MoveTo(obj, height - point.y, point.x)
 90                else:
 91                    assert pp.raw.FPDFPath_MoveTo(obj, point.x, point.y)
 92            else:
 93                if rotation:
 94                    assert pp.raw.FPDFPath_LineTo(obj, height - point.y, point.x)
 95                else:
 96                    assert pp.raw.FPDFPath_LineTo(obj, point.x, point.y)
 97        pp.raw.FPDFPage_InsertObject(new_page, obj)
 98
 99    for bbox, _ in page.graphic_clusters():
100        _rect(new_page, rotation, bbox, width=2, stroke=0x00FFFF)
101
102    for link in page.objlinks:
103        _rect(new_page, rotation, link.bbox, width=0.75, stroke=0x9ACD32)
104
105    for link in page.weblinks:
106        for bbox in link.bboxes:
107            _rect(new_page, rotation, bbox, width=0.75, stroke=0x00FF00)
108
109    for char in page.chars:
110        color = 0x0000FF
111        if char.bbox.width:
112            _rect(new_page, rotation, char.bbox, width=0.5, stroke=0xFF0000)
113            _vline(
114                new_page,
115                rotation,
116                char.bbox.midpoint.x,
117                char.bbox.midpoint.y - 1,
118                char.bbox.midpoint.y + 1,
119                width=0.25,
120                stroke=0xFF0000,
121            )
122            _hline(
123                new_page,
124                rotation,
125                char.bbox.midpoint.y,
126                char.bbox.midpoint.x - 1,
127                char.bbox.midpoint.x + 1,
128                width=0.25,
129                stroke=0xFF0000,
130            )
131            color = 0x000000
132        _vline(new_page, rotation, char.origin.x, char.origin.y - 1, char.origin.y + 1, width=0.25, stroke=color)
133        _hline(new_page, rotation, char.origin.y, char.origin.x - 1, char.origin.x + 1, width=0.25, stroke=color)
134
135    assert pp.raw.FPDFPage_GenerateContent(new_page)
136    pp.raw.FPDF_ClosePage(new_page)
137    return new_doc

Copies the page into a new or existing PDF document and overlays the internal information on top of the content.

  • Renders the bounding boxes in RED and origins in BLACK of all characters.
  • Renders the bounding boxes of web links in GREEN.
  • Renders the bounding boxes of object links in YELLOW GREEN.
  • Renders all graphics paths in BLUE.
  • Renders the bounding boxes of computed graphics clusters in CYAN.
Parameters
  • page: The page to be annotated.
  • new_doc: The PDF document to copy the page to. If not provided, a new document is created.
  • index: The index of the page in the new document.
Returns

The new document with the annotated page added.

class Document(pypdfium2._helpers.document.PdfDocument):
 31class Document(pp.PdfDocument):
 32    """
 33    The PDF document is the root of the entire data structure and provides
 34    access to PDF metadata, the table of contents, as well as individual
 35    pages.
 36
 37    You should extend from this class for a specific vendor to provide the
 38    correct page class from `page()` function.
 39
 40    This class is a convenience wrapper with caching around the high-level APIs
 41    of pypdfium.
 42    """
 43
 44    def __init__(self, path: Path, autoclose: bool = False):
 45        """
 46        :param path: Path to the PDF to open.
 47        """
 48        path = Path(path)
 49        self.name: str = path.stem
 50        """Stem of the document file name"""
 51        super().__init__(path, autoclose=autoclose)
 52        self._path = path
 53        self._bbox_cache = defaultdict(dict)
 54        _LOGGER.debug(f"Loading: {path}")
 55
 56    @cached_property
 57    def metadata(self) -> dict[str, str]:
 58        """The PDF metadata dictionary."""
 59        return self.get_metadata_dict()
 60
 61    @property
 62    def destinations(self) -> Iterator[tuple[int, str]]:
 63        """Yields (page 0-index, named destination) of the whole document."""
 64        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
 65            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
 66            clength = ctypes.c_long(length)
 67            cbuffer = ctypes.create_string_buffer(length * 2)
 68            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
 69            name = cbuffer.raw[: clength.value * 2].decode("utf-16-le").rstrip("\x00")
 70            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
 71            yield (page, name)
 72
 73    @cached_property
 74    def toc(self) -> list[pp.PdfOutlineItem]:
 75        """
 76        The table of content as a sorted list of outline items ensuring item has
 77        a page index by reusing the last one.
 78        """
 79        tocs = set()
 80        # Sometimes the TOC contains duplicates so we must use a set
 81        last_page_index = 0
 82        for toc in self.get_toc():
 83            outline = _OutlineItem(
 84                toc.level,
 85                toc.title,
 86                toc.is_closed,
 87                toc.n_kids,
 88                toc.page_index or last_page_index,
 89                toc.view_mode,
 90                toc.view_pos,
 91            )
 92            last_page_index = toc.page_index or last_page_index
 93            tocs.add(outline)
 94        return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title)))
 95
 96    @cached_property
 97    def identifier_permanent(self) -> str:
 98        """The permanent file identifier."""
 99        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)
100
101    @cached_property
102    def identifier_changing(self) -> str:
103        """The changing file identifier."""
104        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)
105
106    @cached_property
107    def page_count(self) -> int:
108        """The number of pages in the document."""
109        return pp.raw.FPDF_GetPageCount(self)
110
111    @cache
112    def page(self, index: int) -> Page:
113        """
114        :param index: 0-indexed page number.
115        :return: the page object for the index.
116        """
117        assert index < self.page_count
118        return Page(self, index)
119
120    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
121        """
122        :param numbers: an iterable range of page numbers (0-indexed!).
123                        If `None`, then the whole page range is used.
124        :return: yields each page in the range.
125        """
126        if numbers is None:
127            numbers = range(self.page_count)
128        for ii in numbers:
129            if 0 <= ii < self.page_count:
130                yield self.page(ii)
131
132    def __repr__(self) -> str:
133        return f"Doc({self.name})"

The PDF document is the root of the entire data structure and provides access to PDF metadata, the table of contents, as well as individual pages.

You should extend from this class for a specific vendor to provide the correct page class from page() function.

This class is a convenience wrapper with caching around the high-level APIs of pypdfium.

Document(path: pathlib.Path, autoclose: bool = False)
44    def __init__(self, path: Path, autoclose: bool = False):
45        """
46        :param path: Path to the PDF to open.
47        """
48        path = Path(path)
49        self.name: str = path.stem
50        """Stem of the document file name"""
51        super().__init__(path, autoclose=autoclose)
52        self._path = path
53        self._bbox_cache = defaultdict(dict)
54        _LOGGER.debug(f"Loading: {path}")
Parameters
  • path: Path to the PDF to open.
name: str

Stem of the document file name

metadata: dict[str, str]
56    @cached_property
57    def metadata(self) -> dict[str, str]:
58        """The PDF metadata dictionary."""
59        return self.get_metadata_dict()

The PDF metadata dictionary.

destinations: Iterator[tuple[int, str]]
61    @property
62    def destinations(self) -> Iterator[tuple[int, str]]:
63        """Yields (page 0-index, named destination) of the whole document."""
64        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
65            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
66            clength = ctypes.c_long(length)
67            cbuffer = ctypes.create_string_buffer(length * 2)
68            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
69            name = cbuffer.raw[: clength.value * 2].decode("utf-16-le").rstrip("\x00")
70            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
71            yield (page, name)

Yields (page 0-index, named destination) of the whole document.

toc: list[pypdfium2._helpers.document.PdfOutlineItem]
73    @cached_property
74    def toc(self) -> list[pp.PdfOutlineItem]:
75        """
76        The table of content as a sorted list of outline items ensuring item has
77        a page index by reusing the last one.
78        """
79        tocs = set()
80        # Sometimes the TOC contains duplicates so we must use a set
81        last_page_index = 0
82        for toc in self.get_toc():
83            outline = _OutlineItem(
84                toc.level,
85                toc.title,
86                toc.is_closed,
87                toc.n_kids,
88                toc.page_index or last_page_index,
89                toc.view_mode,
90                toc.view_pos,
91            )
92            last_page_index = toc.page_index or last_page_index
93            tocs.add(outline)
94        return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title)))

The table of contents as a sorted list of outline items, ensuring that each item has a page index by reusing the previous one.

identifier_permanent: str
96    @cached_property
97    def identifier_permanent(self) -> str:
98        """The permanent file identifier."""
99        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)

The permanent file identifier.

identifier_changing: str
101    @cached_property
102    def identifier_changing(self) -> str:
103        """The changing file identifier."""
104        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)

The changing file identifier.

page_count: int
106    @cached_property
107    def page_count(self) -> int:
108        """The number of pages in the document."""
109        return pp.raw.FPDF_GetPageCount(self)

The number of pages in the document.

@cache
def page(self, index: int) -> Page:
111    @cache
112    def page(self, index: int) -> Page:
113        """
114        :param index: 0-indexed page number.
115        :return: the page object for the index.
116        """
117        assert index < self.page_count
118        return Page(self, index)
Parameters
  • index: 0-indexed page number.
Returns

the page object for the index.

def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
120    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
121        """
122        :param numbers: an iterable range of page numbers (0-indexed!).
123                        If `None`, then the whole page range is used.
124        :return: yields each page in the range.
125        """
126        if numbers is None:
127            numbers = range(self.page_count)
128        for ii in numbers:
129            if 0 <= ii < self.page_count:
130                yield self.page(ii)
Parameters
  • numbers: an iterable range of page numbers (0-indexed!). If None, then the whole page range is used.
Returns

yields each page in the range.

Inherited Members
pypdfium2._helpers.document.PdfDocument
formenv
parent
new
init_forms
get_formtype
get_pagemode
is_tagged
save
get_identifier
get_version
get_metadata_value
METADATA_KEYS
get_metadata_dict
count_attachments
get_attachment
new_attachment
del_attachment
get_page
new_page
del_page
import_pages
get_page_size
get_page_label
page_as_xobject
get_toc
render
pypdfium2.internal.bases.AutoCloseable
close
class Page(pypdfium2._helpers.page.PdfPage):
 24class Page(pp.PdfPage):
 25    """
 26    This class provides low-level access to graphics and characters of the page.
 27    It also fixes missing bounding boxes for rotates characters on page load,
 28    as well as allow searching for characters in an area instead of just text.
 29    """
 30
 31    def __init__(self, document: "modm_data.pdf.Document", index: int):  # noqa: F821
 32        """
 33        :param document: a PDF document.
 34        :param index: 0-index page number.
 35        """
 36        self.index = index
 37        """0-index page number."""
 38        self.number = index + 1
 39        """1-index page number."""
 40
 41        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
 42        self._links = None
 43        self._weblinks = None
 44        self._linked = False
 45
 46        _LOGGER.debug(f"Loading: {index}")
 47
 48        self._text = self.get_textpage()
 49        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
 50        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
 51        # close them in reverse order
 52        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
 53        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)
 54
 55        self._fix_bboxes()
 56
 57    @cached_property
 58    def label(self) -> str:
 59        """The page label."""
 60        return self.pdf.get_page_label(self.index)
 61
 62    @cached_property
 63    def width(self) -> float:
 64        """The page width."""
 65        return self.get_width()
 66
 67    @cached_property
 68    def height(self) -> float:
 69        """The page height."""
 70        return self.get_height()
 71
 72    @cached_property
 73    def rotation(self) -> int:
 74        """The page rotation in degrees."""
 75        return self.get_rotation()
 76
 77    @cached_property
 78    def bbox(self) -> Rectangle:
 79        """The page bounding box."""
 80        return Rectangle(*self.get_bbox())
 81
 82    @cached_property
 83    def char_count(self) -> int:
 84        """The total count of characters."""
 85        return self._text.count_chars()
 86
 87    @cache
 88    def char(self, index: int) -> Character:
 89        """:return: The character at the 0-index."""
 90        return Character(self, index)
 91
 92    @property
 93    def chars(self) -> Iterator[Character]:
 94        """Yields all characters."""
 95        for ii in range(self.char_count):
 96            yield self.char(ii)
 97
 98    @cached_property
 99    def objlinks(self) -> list[ObjLink]:
100        """All object links."""
101        links = []
102        pos = ctypes.c_int(0)
103        link = pp.raw.FPDF_LINK()
104        while pp.raw.FPDFLink_Enumerate(self, pos, link):
105            links.append(ObjLink(self, link))
106        return links
107
108    @cached_property
109    def weblinks(self) -> list[WebLink]:
110        """All web links."""
111        links = []
112        for ii in range(pp.raw.FPDFLink_CountWebLinks(self._linkpage)):
113            links.append(WebLink(self, ii))
114        return links
115
116    def chars_in_area(self, area: Rectangle) -> list[Character]:
117        """
118        :param area: area to search for character in.
119        :return: All characters found in the area.
120        """
121        found = []
122        # We perform binary searches of the lower and upper y-positions first
123        # lines are ordered by y-position
124        ypositions = list(self._charlines.keys())
125        y_bottom = bisect_left(ypositions, area.bottom)
126        y_top = bisect_right(ypositions, area.top, lo=y_bottom)
127
128        # Then for every line we do another binary search for left and right
129        for ypos in ypositions[y_bottom:y_top]:
130            chars = self._charlines[ypos]
131            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
132            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
133            # Finally we add all these characters
134            found.extend(chars[x_left:x_right])
135        return found
136
137    def text_in_area(self, area: Rectangle) -> str:
138        """
139        :param area: area to search for text in.
140        :return: Only the text found in the area.
141        """
142        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)
143
144    @property
145    def structures(self) -> Iterator[Structure]:
146        """The PDF/UA tags."""
147        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
148        for ii in range(count):
149            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
150            yield Structure(self, child)
151
152    def find(self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
153        """
154        Searches for a match string as whole, consecutive words and yields the
155        characters.
156
157        :param string: The search string.
158        :param case_sensitive: Ignore case if false.
159        :return: yields the characters found.
160        """
161        searcher = self._text.search(string, match_case=case_sensitive, match_whole_word=True, consecutive=True)
162        while idx := searcher.get_next():
163            chars = [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]
164            yield chars
165
166    @cached_property
167    def paths(self) -> list[Path]:
168        """All paths."""
169        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]
170
171    @cached_property
172    def images(self) -> list[Image]:
173        """All images."""
174        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]
175
176    def graphic_clusters(
177        self, predicate: Callable[[Path | Image], bool] = None, absolute_tolerance: float = None
178    ) -> list[tuple[Rectangle, list[Path]]]:
179        if absolute_tolerance is None:
180            absolute_tolerance = min(self.width, self.height) * 0.01
181
182        # First collect all vertical regions
183        filtered_paths = []
184        for path in self.paths:
185            if predicate is None or predicate(path):
186                filtered_paths.append(path)
187        for image in self.images:
188            if predicate is None or predicate(image):
189                filtered_paths.append(image)
190
191        regions = []
192        for path in sorted(filtered_paths, key=lambda path: path.bbox.y):
193            for reg in regions:
194                if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
195                    # They overlap, so merge them
196                    reg.v0 = min(reg.v0, path.bbox.bottom)
197                    reg.v1 = max(reg.v1, path.bbox.top)
198                    reg.objs.append(path)
199                    break
200            else:
201                regions.append(Region(path.bbox.bottom, path.bbox.top, path))
202
203        # Now collect horizontal region inside each vertical region
204        for yreg in regions:
205            for path in sorted(filtered_paths, key=lambda path: path.bbox.x):
206                # check if horizontal line is contained in vregion
207                if yreg.contains(path.bbox.y, absolute_tolerance):
208                    for xreg in yreg.subregions:
209                        if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
210                            # They overlap so merge them
211                            xreg.v0 = min(xreg.v0, path.bbox.left)
212                            xreg.v1 = max(xreg.v1, path.bbox.right)
213                            xreg.objs.append(path)
214                            break
215                    else:
216                        yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))
217
218        clusters = []
219        for yreg in regions:
220            for xreg in yreg.subregions:
221                if len(yreg.subregions) > 1:
222                    # Strip down the height again for subregions
223                    y0, y1 = 1e9, 0
224                    for path in xreg.objs:
225                        y0 = min(y0, path.bbox.bottom)
226                        y1 = max(y1, path.bbox.top)
227                else:
228                    y0, y1 = yreg.v0, yreg.v1
229                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
230                clusters.append((bbox, xreg.objs))
231
232        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
233
234    def _link_characters(self):
235        if self._linked:
236            return
237        # The in-document links only gives us rectangles and we must find the
238        # linked chars ourselves
239        for link in self.objlinks:
240            for char in self.chars_in_area(link.bbox):
241                char.objlink = link
242        # The weblinks give you an explicit char range, very convenient
243        for link in self.weblinks:
244            for ii in range(*link.range):
245                self.char(ii).weblink = link
246        self._linked = True
247
248    @cached_property
249    def _charlines(self):
250        charlines = defaultdict(list)
251        for char in self.chars:
252            charlines[round(char.bbox.midpoint.y, 1)].append(char)
253
254        orderedchars = OrderedDict.fromkeys(sorted(charlines))
255        for ypos, chars in charlines.items():
256            orderedchars[ypos] = sorted(chars, key=lambda c: c.bbox.midpoint.x)
257
258        return orderedchars
259
260    def _fix_bboxes(self):
261        def _key(char):
262            height = round(char.tbbox.height, 1)
263            width = round(char.tbbox.width, 1)
264            return f"{char.font} {char.unicode} {height} {width}"
265
266        fix_chars = []
267        for char in self.chars:
268            if not char._bbox.width or not char._bbox.height:
269                if char._rotation:
270                    fix_chars.append(char)
271                elif char.unicode not in {0xA, 0xD}:
272                    fix_chars.append(char)
273            elif char.unicode not in {0xA, 0xD} and not char._rotation and _key(char) not in self.pdf._bbox_cache:
274                bbox = char._bbox.translated(-char.origin).rotated(self.rotation + char._rotation)
275                self.pdf._bbox_cache[_key(char)] = (char, bbox)
276                # print("->", _key(char), char.descr(), char.height, char.rotation, char._rotation, self.rotation)
277        for char in fix_chars:
278            bbox = self.pdf._bbox_cache.get(_key(char))
279            if bbox is not None:
280                # print("<-", char.descr(), char._rotation, char.rotation, char.height)
281                _, bbox = bbox
282                bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
283                char._bbox = bbox
284            elif char.unicode not in {0x20, 0xA, 0xD}:
285                _LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")

This class provides low-level access to graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.

Page(document: Document, index: int)
31    def __init__(self, document: "modm_data.pdf.Document", index: int):  # noqa: F821
32        """
33        :param document: a PDF document.
34        :param index: 0-index page number.
35        """
36        self.index = index
37        """0-index page number."""
38        self.number = index + 1
39        """1-index page number."""
40
41        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
42        self._links = None
43        self._weblinks = None
44        self._linked = False
45
46        _LOGGER.debug(f"Loading: {index}")
47
48        self._text = self.get_textpage()
49        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
50        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
51        # close them in reverse order
52        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
53        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)
54
55        self._fix_bboxes()
Parameters
  • document: a PDF document.
  • index: 0-index page number.
index

0-index page number.

number

1-index page number.

label: str
57    @cached_property
58    def label(self) -> str:
59        """The page label."""
60        return self.pdf.get_page_label(self.index)

The page label.

width: float
62    @cached_property
63    def width(self) -> float:
64        """The page width."""
65        return self.get_width()

The page width.

height: float
67    @cached_property
68    def height(self) -> float:
69        """The page height."""
70        return self.get_height()

The page height.

rotation: int
72    @cached_property
73    def rotation(self) -> int:
74        """The page rotation in degrees."""
75        return self.get_rotation()

The page rotation in degrees.

bbox: modm_data.utils.Rectangle
77    @cached_property
78    def bbox(self) -> Rectangle:
79        """The page bounding box."""
80        return Rectangle(*self.get_bbox())

The page bounding box.

char_count: int
82    @cached_property
83    def char_count(self) -> int:
84        """The total count of characters."""
85        return self._text.count_chars()

The total count of characters.

@cache
def char(self, index: int) -> Character:
87    @cache
88    def char(self, index: int) -> Character:
89        """:return: The character at the 0-index."""
90        return Character(self, index)
Returns

The character at the 0-index.

chars: Iterator[Character]
92    @property
93    def chars(self) -> Iterator[Character]:
94        """Yields all characters."""
95        for ii in range(self.char_count):
96            yield self.char(ii)

Yields all characters.

def chars_in_area( self, area: modm_data.utils.Rectangle) -> list[Character]:
116    def chars_in_area(self, area: Rectangle) -> list[Character]:
117        """
118        :param area: area to search for character in.
119        :return: All characters found in the area.
120        """
121        found = []
122        # We perform binary searches of the lower and upper y-positions first
123        # lines are ordered by y-position
124        ypositions = list(self._charlines.keys())
125        y_bottom = bisect_left(ypositions, area.bottom)
126        y_top = bisect_right(ypositions, area.top, lo=y_bottom)
127
128        # Then for every line we do another binary search for left and right
129        for ypos in ypositions[y_bottom:y_top]:
130            chars = self._charlines[ypos]
131            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
132            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
133            # Finally we add all these characters
134            found.extend(chars[x_left:x_right])
135        return found
Parameters
  • area: area to search for character in.
Returns

All characters found in the area.

def text_in_area(self, area: modm_data.utils.Rectangle) -> str:
137    def text_in_area(self, area: Rectangle) -> str:
138        """
139        :param area: area to search for text in.
140        :return: Only the text found in the area.
141        """
142        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)
Parameters
  • area: area to search for text in.
Returns

Only the text found in the area.

structures: Iterator[Structure]
144    @property
145    def structures(self) -> Iterator[Structure]:
146        """The PDF/UA tags."""
147        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
148        for ii in range(count):
149            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
150            yield Structure(self, child)

The PDF/UA tags.

def find( self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
152    def find(self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
153        """
154        Searches for a match string as whole, consecutive words and yields the
155        characters.
156
157        :param string: The search string.
158        :param case_sensitive: Ignore case if false.
159        :return: yields the characters found.
160        """
161        searcher = self._text.search(string, match_case=case_sensitive, match_whole_word=True, consecutive=True)
162        while idx := searcher.get_next():
163            chars = [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]
164            yield chars

Searches for a match string as whole, consecutive words and yields the characters.

Parameters
  • string: The search string.
  • case_sensitive: Ignore case if false.
Returns

yields the characters found.

paths: list[Path]
166    @cached_property
167    def paths(self) -> list[Path]:
168        """All paths."""
169        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]

All paths.

images: list[Image]
171    @cached_property
172    def images(self) -> list[Image]:
173        """All images."""
174        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]

All images.

def graphic_clusters( self, predicate: Callable[[Path | Image], bool] = None, absolute_tolerance: float = None) -> list[tuple[modm_data.utils.Rectangle, list[Path]]]:
176    def graphic_clusters(
177        self, predicate: Callable[[Path | Image], bool] = None, absolute_tolerance: float = None
178    ) -> list[tuple[Rectangle, list[Path]]]:
179        if absolute_tolerance is None:
180            absolute_tolerance = min(self.width, self.height) * 0.01
181
182        # First collect all vertical regions
183        filtered_paths = []
184        for path in self.paths:
185            if predicate is None or predicate(path):
186                filtered_paths.append(path)
187        for image in self.images:
188            if predicate is None or predicate(image):
189                filtered_paths.append(image)
190
191        regions = []
192        for path in sorted(filtered_paths, key=lambda path: path.bbox.y):
193            for reg in regions:
194                if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
195                    # They overlap, so merge them
196                    reg.v0 = min(reg.v0, path.bbox.bottom)
197                    reg.v1 = max(reg.v1, path.bbox.top)
198                    reg.objs.append(path)
199                    break
200            else:
201                regions.append(Region(path.bbox.bottom, path.bbox.top, path))
202
203        # Now collect horizontal region inside each vertical region
204        for yreg in regions:
205            for path in sorted(filtered_paths, key=lambda path: path.bbox.x):
206                # check if horizontal line is contained in vregion
207                if yreg.contains(path.bbox.y, absolute_tolerance):
208                    for xreg in yreg.subregions:
209                        if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
210                            # They overlap so merge them
211                            xreg.v0 = min(xreg.v0, path.bbox.left)
212                            xreg.v1 = max(xreg.v1, path.bbox.right)
213                            xreg.objs.append(path)
214                            break
215                    else:
216                        yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))
217
218        clusters = []
219        for yreg in regions:
220            for xreg in yreg.subregions:
221                if len(yreg.subregions) > 1:
222                    # Strip down the height again for subregions
223                    y0, y1 = 1e9, 0
224                    for path in xreg.objs:
225                        y0 = min(y0, path.bbox.bottom)
226                        y1 = max(y1, path.bbox.top)
227                else:
228                    y0, y1 = yreg.v0, yreg.v1
229                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
230                clusters.append((bbox, xreg.objs))
231
232        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
Inherited Members
pypdfium2._helpers.page.PdfPage
parent
get_width
get_height
get_size
get_rotation
set_rotation
get_mediabox
set_mediabox
get_cropbox
set_cropbox
get_bleedbox
set_bleedbox
get_trimbox
set_trimbox
get_artbox
set_artbox
get_bbox
get_textpage
insert_obj
remove_obj
gen_content
get_objects
render
pypdfium2.internal.bases.AutoCloseable
close
class Character:
 13class Character:
 14    """
 15    Each character on the PDF page is represented by a character object,
 16    describing exactly where and how to render the associated glyph.
 17
 18    While there are font flags, PDF files typically use entirely different fonts
 19    to render normal, bold, and italic characters.
 20
 21    The character's loose bounding box may not always be available, since it
 22    must be explicitly provided by the font. The tight bounding box is only
 23    available as long as the glyph is renderable, so a space character may have
 24    a loose, but not a tight bounding box, or none at all.
 25    """
 26
 27    class RenderMode(Enum):
 28        """Tells the PDF viewer how to render this character glyph."""
 29
 30        UNKNOWN = -1
 31        FILL = 0
 32        STROKE = 1
 33        FILL_STROKE = 2
 34        INVISIBLE = 3
 35        FILL_CLIP = 4
 36        STROKE_CLIP = 5
 37        FILL_STROKE_CLIP = 6
 38        CLIP = 7
 39
 40    def __init__(self, page: "modm_data.pdf.page.Page", index: int):  # noqa: F821
 41        """
 42        :param page: The page containing the character.
 43        :param index: The index of the character.
 44        """
 45        self._page = page
 46        self._text = page._text
 47        self._index = index
 48        self._font = None
 49        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))
 50
 51        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
 52        """The unicode value of the character."""
 53        self.objlink: "modm_data.pdf.link.ObjLink" = None  # noqa: F821
 54        """The object link of this character or `None`"""
 55        self.weblink: "modm_data.pdf.link.WebLink" = None  # noqa: F821
 56        """The web link of this character or `None`"""
 57
 58        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
 59        if self._page.rotation:
 60            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x, bbox.p1.y, self._page.height - bbox.p0.x)
 61        self._bbox = bbox
 62
 63    def _font_flags(self) -> tuple[str, int]:
 64        if self._font is None:
 65            font = ctypes.create_string_buffer(255)
 66            flags = ctypes.c_int()
 67            pp.raw.FPDFText_GetFontInfo(self._text, self._index, font, 255, flags)
 68            self._font = (font.value.decode("utf-8"), flags.value)
 69        return self._font
 70
 71    @property
 72    def char(self) -> str:
 73        """The printable string of the unicode value."""
 74        char = chr(self.unicode)
 75        return char if char.isprintable() else ""
 76
 77    @cached_property
 78    def origin(self) -> Point:
 79        """The origin of the character."""
 80        x, y = ctypes.c_double(), ctypes.c_double()
 81        assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y)
 82        if self._page.rotation:
 83            return Point(y.value, self._page.height - x.value)
 84        return Point(x.value, y.value)
 85
 86    @cached_property
 87    def width(self) -> float:
 88        """The width of the character's bounding box."""
 89        if self.rotation:
 90            return self.bbox.height
 91        return self.bbox.width
 92
 93    @cached_property
 94    def height(self) -> float:
 95        """The height of the character's bounding box."""
 96        if self.rotation:
 97            return self.bbox.width
 98        return self.bbox.height
 99
100    @cached_property
101    def tbbox(self) -> Rectangle:
102        """The tight bounding box of the character."""
103        tbbox = Rectangle(*self._text.get_charbox(self._index))
104        if self._page.rotation:
105            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x, tbbox.p1.y, self._page.height - tbbox.p0.x)
106        return tbbox
107
108    @property
109    def bbox(self) -> Rectangle:
110        """
111        The loose bounding box of the character.
112        .. note::
113            If the loose bounding box is not available, the tight bounding box
114            is used instead.
115        """
116        if not self._bbox.width or not self._bbox.height:
117            return self.tbbox
118        return self._bbox
119
120    @cached_property
121    def twidth(self) -> float:
122        """The width of the character's tight bounding box."""
123        return self.tbbox.width
124
125    @cached_property
126    def theight(self) -> float:
127        """The height of the character's tight bounding box."""
128        return self.tbbox.height
129
130    @cached_property
131    def render_mode(self) -> RenderMode:
132        """The render mode of the character."""
133        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))
134
135    @cached_property
136    def rotation(self) -> int:
137        """The rotation of the character in degrees modulo 360."""
138        # Special case for vertical text in rotated pages
139        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xA, 0xD}:
140            return 90
141        if self._page.rotation and self._rotation:
142            return (self._page.rotation + self._rotation) % 360
143        return self._rotation
144
145    @cached_property
146    def size(self) -> float:
147        """The font size of the character."""
148        return pp.raw.FPDFText_GetFontSize(self._text, self._index)
149
150    @cached_property
151    def weight(self) -> int:
152        """The font weight of the character."""
153        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)
154
155    @cached_property
156    def fill(self) -> int:
157        """The fill color of the character."""
158        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
159        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
160        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
161
162    @cached_property
163    def stroke(self) -> int:
164        """The stroke color of the character."""
165        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
166        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
167        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
168
169    @cached_property
170    def font(self) -> str:
171        """The font name of the character."""
172        return self._font_flags()[0]
173
174    @cached_property
175    def flags(self) -> int:
176        """The font flags of the character."""
177        return self._font_flags()[1]
178
179    def descr(self) -> str:
180        """Human-readable description of the character for debugging."""
181        char = chr(self.unicode)
182        if not char.isprintable():
183            char = hex(self.unicode)
184        return (
185            f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, "
186            f"{self.render_mode}, {self.font}, {hex(self.flags)}, "
187            f"{self.fill}, {self.stroke}, {repr(self.bbox)})"
188        )
189
190    def __str__(self) -> str:
191        return self.char
192
193    def __repr__(self) -> str:
194        char = chr(self.unicode)
195        escape = {0xA: "\\n", 0xD: "\\r", 0x9: "\\t", 0x20: "␣"}
196        char = escape.get(self.unicode, char if char.isprintable() else hex(self.unicode))
197        return char

Each character on the PDF page is represented by a character object, describing exactly where and how to render the associated glyph.

While there are font flags, PDF files typically use entirely different fonts to render normal, bold, and italic characters.

The character's loose bounding box may not always be available, since it must be explicitly provided by the font. The tight bounding box is only available as long as the glyph is renderable, so a space character may have a loose, but not a tight bounding box, or none at all.

Character(page: Page, index: int)
40    def __init__(self, page: "modm_data.pdf.page.Page", index: int):  # noqa: F821
41        """
42        :param page: The page containing the character.
43        :param index: The index of the character.
44        """
45        self._page = page
46        self._text = page._text
47        self._index = index
48        self._font = None
49        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))
50
51        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
52        """The unicode value of the character."""
53        self.objlink: "modm_data.pdf.link.ObjLink" = None  # noqa: F821
54        """The object link of this character or `None`"""
55        self.weblink: "modm_data.pdf.link.WebLink" = None  # noqa: F821
56        """The web link of this character or `None`"""
57
58        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
59        if self._page.rotation:
60            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x, bbox.p1.y, self._page.height - bbox.p0.x)
61        self._bbox = bbox
Parameters
  • page: The page containing the character.
  • index: The index of the character.
unicode: int

The unicode value of the character.

char: str
71    @property
72    def char(self) -> str:
73        """The printable string of the unicode value."""
74        char = chr(self.unicode)
75        return char if char.isprintable() else ""

The printable string of the unicode value.

origin: modm_data.utils.Point
77    @cached_property
78    def origin(self) -> Point:
79        """The origin of the character."""
80        x, y = ctypes.c_double(), ctypes.c_double()
81        assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y)
82        if self._page.rotation:
83            return Point(y.value, self._page.height - x.value)
84        return Point(x.value, y.value)

The origin of the character.

width: float
86    @cached_property
87    def width(self) -> float:
88        """The width of the character's bounding box."""
89        if self.rotation:
90            return self.bbox.height
91        return self.bbox.width

The width of the character's bounding box.

height: float
93    @cached_property
94    def height(self) -> float:
95        """The height of the character's bounding box."""
96        if self.rotation:
97            return self.bbox.width
98        return self.bbox.height

The height of the character's bounding box.

tbbox: modm_data.utils.Rectangle
100    @cached_property
101    def tbbox(self) -> Rectangle:
102        """The tight bounding box of the character."""
103        tbbox = Rectangle(*self._text.get_charbox(self._index))
104        if self._page.rotation:
105            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x, tbbox.p1.y, self._page.height - tbbox.p0.x)
106        return tbbox

The tight bounding box of the character.

bbox: modm_data.utils.Rectangle
108    @property
109    def bbox(self) -> Rectangle:
110        """
111        The loose bounding box of the character.
112        .. note::
113            If the loose bounding box is not available, the tight bounding box
114            is used instead.
115        """
116        if not self._bbox.width or not self._bbox.height:
117            return self.tbbox
118        return self._bbox

The loose bounding box of the character.

If the loose bounding box is not available, the tight bounding box is used instead.

twidth: float
120    @cached_property
121    def twidth(self) -> float:
122        """The width of the character's tight bounding box."""
123        return self.tbbox.width

The width of the character's tight bounding box.

theight: float
125    @cached_property
126    def theight(self) -> float:
127        """The height of the character's tight bounding box."""
128        return self.tbbox.height

The height of the character's tight bounding box.

render_mode: Character.RenderMode
130    @cached_property
131    def render_mode(self) -> RenderMode:
132        """The render mode of the character."""
133        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))

The render mode of the character.

rotation: int
135    @cached_property
136    def rotation(self) -> int:
137        """The rotation of the character in degrees modulo 360."""
138        # Special case for vertical text in rotated pages
139        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xA, 0xD}:
140            return 90
141        if self._page.rotation and self._rotation:
142            return (self._page.rotation + self._rotation) % 360
143        return self._rotation

The rotation of the character in degrees modulo 360.

size: float
145    @cached_property
146    def size(self) -> float:
147        """The font size of the character."""
148        return pp.raw.FPDFText_GetFontSize(self._text, self._index)

The font size of the character.

weight: int
150    @cached_property
151    def weight(self) -> int:
152        """The font weight of the character."""
153        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)

The font weight of the character.

fill: int
155    @cached_property
156    def fill(self) -> int:
157        """The fill color of the character."""
158        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
159        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
160        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The fill color of the character.

stroke: int
162    @cached_property
163    def stroke(self) -> int:
164        """The stroke color of the character."""
165        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
166        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
167        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The stroke color of the character.

font: str
169    @cached_property
170    def font(self) -> str:
171        """The font name of the character."""
172        return self._font_flags()[0]

The font name of the character.

flags: int
174    @cached_property
175    def flags(self) -> int:
176        """The font flags of the character."""
177        return self._font_flags()[1]

The font flags of the character.

def descr(self) -> str:
179    def descr(self) -> str:
180        """Human-readable description of the character for debugging."""
181        char = chr(self.unicode)
182        if not char.isprintable():
183            char = hex(self.unicode)
184        return (
185            f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, "
186            f"{self.render_mode}, {self.font}, {hex(self.flags)}, "
187            f"{self.fill}, {self.stroke}, {repr(self.bbox)})"
188        )

Human-readable description of the character for debugging.

class Character.RenderMode(enum.Enum):
    class RenderMode(Enum):
        """Tells the PDF viewer how to render this character glyph."""

        # Members are looked up by value from the result of
        # `FPDFText_GetTextRenderMode` in `Character.render_mode`.
        # NOTE(review): -1 is presumably what PDFium reports on failure — confirm.
        UNKNOWN = -1
        FILL = 0
        STROKE = 1
        FILL_STROKE = 2
        INVISIBLE = 3
        FILL_CLIP = 4
        STROKE_CLIP = 5
        FILL_STROKE_CLIP = 6
        CLIP = 7

Tells the PDF viewer how to render this character glyph.

UNKNOWN = <RenderMode.UNKNOWN: -1>
FILL = <RenderMode.FILL: 0>
STROKE = <RenderMode.STROKE: 1>
FILL_STROKE = <RenderMode.FILL_STROKE: 2>
INVISIBLE = <RenderMode.INVISIBLE: 3>
FILL_CLIP = <RenderMode.FILL_CLIP: 4>
STROKE_CLIP = <RenderMode.STROKE_CLIP: 5>
FILL_STROKE_CLIP = <RenderMode.FILL_STROKE_CLIP: 6>
CLIP = <RenderMode.CLIP: 7>
Inherited Members
enum.Enum
name
value
class Path(pypdfium2._helpers.pageobjects.PdfObject):
 12class Path(pp.PdfObject):
 13    """
 14    PDF uses a subset of the PostScript graphics language, which draws vector
 15    paths with various rendering options. We are only interested in the basic
 16    properties, in particular, for recognizing table cell borders.
 17
 18    This class specializes `pypdfium2.PdfObject` to add accessors for  graphics
 19    containing vector paths of various configurations.
 20
 21    You must construct the paths by calling `modm_data.pdf.page.Page.paths`.
 22    """
 23
 24    class Type(Enum):
 25        """Path Type"""
 26
 27        LINE = 0
 28        BEZIER = 1
 29        MOVE = 2
 30
 31    class Cap(Enum):
 32        """Path Cap Type"""
 33
 34        BUTT = 0
 35        ROUND = 1
 36        PROJECTING_SQUARE = 2
 37
 38    class Join(Enum):
 39        """Path Join Type"""
 40
 41        MITER = 0
 42        ROUND = 1
 43        BEVEL = 2
 44
 45    # Overwrite the PdfPageObject.__new__ function
 46    def __new__(cls, *args, **kwargs):
 47        return object.__new__(cls)
 48
 49    def __init__(self, obj):
 50        """
 51        :param obj: PDF object of the path.
 52        """
 53        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
 54        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_PATH
 55        self.type = pp.raw.FPDF_PAGEOBJ_PATH
 56
 57    @cached_property
 58    def matrix(self) -> pp.PdfMatrix:
 59        """The transformation matrix."""
 60        return self.get_matrix()
 61
 62    @cached_property
 63    def count(self) -> int:
 64        """Number of segments in this path."""
 65        return pp.raw.FPDFPath_CountSegments(self)
 66
 67    @cached_property
 68    def fill(self) -> int:
 69        """The fill color encoded as 32-bit RGBA."""
 70        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
 71        assert pp.raw.FPDFPageObj_GetFillColor(self, r, g, b, a)
 72        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
 73
 74    @cached_property
 75    def stroke(self) -> int:
 76        """The stroke color encoded as 32-bit RGBA."""
 77        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
 78        assert pp.raw.FPDFPageObj_GetStrokeColor(self, r, g, b, a)
 79        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
 80
 81    @cached_property
 82    def width(self) -> float:
 83        """The stroke width."""
 84        width = ctypes.c_float()
 85        assert pp.raw.FPDFPageObj_GetStrokeWidth(self, width)
 86        return width.value
 87
 88    @cached_property
 89    def cap(self) -> Cap:
 90        """Line cap type."""
 91        return Path.Cap(pp.raw.FPDFPageObj_GetLineCap(self))
 92
 93    @cached_property
 94    def join(self) -> Join:
 95        """Line join type."""
 96        return Path.Join(pp.raw.FPDFPageObj_GetLineJoin(self))
 97
 98    @cached_property
 99    def bbox(self) -> Rectangle:
100        """
101        Bounding box of the path.
102        .. warning::
103            The bounding is only approximated using the control points!
104            Therefore bezier curves will likely have a larger bounding box.
105        """
106        left, bottom = ctypes.c_float(), ctypes.c_float()
107        right, top = ctypes.c_float(), ctypes.c_float()
108        assert pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top)
109        bbox = Rectangle(left.value, bottom.value, right.value, top.value)
110        if self.page.rotation:
111            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
112        return bbox
113
114    @cached_property
115    def points(self) -> list[Point]:
116        """
117        List of points of the path. If the path is closed, the first point is
118        added to the end of the list.
119        """
120        points = []
121        for ii in range(self.count):
122            seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
123            ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
124            # The first point should always be MOVETO
125            assert ii or ptype == Path.Type.MOVE
126
127            x, y = ctypes.c_float(), ctypes.c_float()
128            assert pp.raw.FPDFPathSegment_GetPoint(seg, x, y)
129            x, y = self.matrix.on_point(x.value, y.value)
130            points.append(Point(x, y, type=ptype))
131
132            if pp.raw.FPDFPathSegment_GetClose(seg):
133                points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))
134
135        if self.page.rotation:
136            points = [Point(y, self.page.height - x, type=p.type) for p in points]
137        return points
138
139    @cached_property
140    def lines(self) -> list[Line]:
141        """List of lines between the path points."""
142        points = self.points
143        return [
144            Line(points[ii], points[ii + 1], width=self.width, type=points[ii + 1].type)
145            for ii in range(len(points) - 1)
146        ]
147
148    def __repr__(self) -> str:
149        points = ",".join(repr(p) for p in self.points)
150        return f"P{self.count}={points}"

PDF uses a subset of the PostScript graphics language, which draws vector paths with various rendering options. We are only interested in the basic properties, in particular, for recognizing table cell borders.

This class specializes pypdfium2.PdfObject to add accessors for graphics containing vector paths of various configurations.

You must construct the paths by calling modm_data.pdf.page.Page.paths.

Path(obj)
49    def __init__(self, obj):
50        """
51        :param obj: PDF object of the path.
52        """
53        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
54        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_PATH
55        self.type = pp.raw.FPDF_PAGEOBJ_PATH
Parameters
  • obj: PDF object of the path.
type
matrix: pypdfium2._helpers.matrix.PdfMatrix
57    @cached_property
58    def matrix(self) -> pp.PdfMatrix:
59        """The transformation matrix."""
60        return self.get_matrix()

The transformation matrix.

count: int
62    @cached_property
63    def count(self) -> int:
64        """Number of segments in this path."""
65        return pp.raw.FPDFPath_CountSegments(self)

Number of segments in this path.

fill: int
67    @cached_property
68    def fill(self) -> int:
69        """The fill color encoded as 32-bit RGBA."""
70        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
71        assert pp.raw.FPDFPageObj_GetFillColor(self, r, g, b, a)
72        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The fill color encoded as 32-bit RGBA.

stroke: int
74    @cached_property
75    def stroke(self) -> int:
76        """The stroke color encoded as 32-bit RGBA."""
77        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
78        assert pp.raw.FPDFPageObj_GetStrokeColor(self, r, g, b, a)
79        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The stroke color encoded as 32-bit RGBA.

width: float
81    @cached_property
82    def width(self) -> float:
83        """The stroke width."""
84        width = ctypes.c_float()
85        assert pp.raw.FPDFPageObj_GetStrokeWidth(self, width)
86        return width.value

The stroke width.

cap: Path.Cap
88    @cached_property
89    def cap(self) -> Cap:
90        """Line cap type."""
91        return Path.Cap(pp.raw.FPDFPageObj_GetLineCap(self))

Line cap type.

join: Path.Join
93    @cached_property
94    def join(self) -> Join:
95        """Line join type."""
96        return Path.Join(pp.raw.FPDFPageObj_GetLineJoin(self))

Line join type.

bbox: modm_data.utils.Rectangle
 98    @cached_property
 99    def bbox(self) -> Rectangle:
100        """
101        Bounding box of the path.
102        .. warning::
103            The bounding is only approximated using the control points!
104            Therefore bezier curves will likely have a larger bounding box.
105        """
106        left, bottom = ctypes.c_float(), ctypes.c_float()
107        right, top = ctypes.c_float(), ctypes.c_float()
108        assert pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top)
109        bbox = Rectangle(left.value, bottom.value, right.value, top.value)
110        if self.page.rotation:
111            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
112        return bbox

Bounding box of the path.

The bounding is only approximated using the control points! Therefore bezier curves will likely have a larger bounding box.

points: list[modm_data.utils.Point]
114    @cached_property
115    def points(self) -> list[Point]:
116        """
117        List of points of the path. If the path is closed, the first point is
118        added to the end of the list.
119        """
120        points = []
121        for ii in range(self.count):
122            seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
123            ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
124            # The first point should always be MOVETO
125            assert ii or ptype == Path.Type.MOVE
126
127            x, y = ctypes.c_float(), ctypes.c_float()
128            assert pp.raw.FPDFPathSegment_GetPoint(seg, x, y)
129            x, y = self.matrix.on_point(x.value, y.value)
130            points.append(Point(x, y, type=ptype))
131
132            if pp.raw.FPDFPathSegment_GetClose(seg):
133                points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))
134
135        if self.page.rotation:
136            points = [Point(y, self.page.height - x, type=p.type) for p in points]
137        return points

List of points of the path. If the path is closed, the first point is added to the end of the list.

lines: list[modm_data.utils.Line]
139    @cached_property
140    def lines(self) -> list[Line]:
141        """List of lines between the path points."""
142        points = self.points
143        return [
144            Line(points[ii], points[ii + 1], width=self.width, type=points[ii + 1].type)
145            for ii in range(len(points) - 1)
146        ]

List of lines between the path points.

Inherited Members
pypdfium2._helpers.pageobjects.PdfObject
parent
get_pos
get_matrix
set_matrix
transform
pypdfium2.internal.bases.AutoCloseable
close
class Path.Type(enum.Enum):
    class Type(Enum):
        """Path Type"""

        # Members are looked up by value from the result of
        # `FPDFPathSegment_GetType` in `Path.points`.
        LINE = 0
        BEZIER = 1
        MOVE = 2

Path Type

LINE = <Type.LINE: 0>
BEZIER = <Type.BEZIER: 1>
MOVE = <Type.MOVE: 2>
Inherited Members
enum.Enum
name
value
class Path.Cap(enum.Enum):
    class Cap(Enum):
        """Path Cap Type"""

        # Members are looked up by value from the result of
        # `FPDFPageObj_GetLineCap` in `Path.cap`.
        BUTT = 0
        ROUND = 1
        PROJECTING_SQUARE = 2

Path Cap Type

BUTT = <Cap.BUTT: 0>
ROUND = <Cap.ROUND: 1>
PROJECTING_SQUARE = <Cap.PROJECTING_SQUARE: 2>
Inherited Members
enum.Enum
name
value
class Path.Join(enum.Enum):
    class Join(Enum):
        """Path Join Type"""

        # Members are looked up by value from the result of
        # `FPDFPageObj_GetLineJoin` in `Path.join`.
        MITER = 0
        ROUND = 1
        BEVEL = 2

Path Join Type

MITER = <Join.MITER: 0>
ROUND = <Join.ROUND: 1>
BEVEL = <Join.BEVEL: 2>
Inherited Members
enum.Enum
name
value
class Image(pypdfium2._helpers.pageobjects.PdfImage):
class Image(pp.PdfImage):
    """
    This class extends `pypdfium2.PdfImage` to align it with the interface of
    the `Path` class so that it can be used in the same
    algorithms without filtering.

    You must construct the images by calling `modm_data.pdf.page.Page.images`.

    .. note:: Images are currently ignored.
    """

    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: Page object of the image.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE
        self.type = pp.raw.FPDF_PAGEOBJ_IMAGE

        self.count: int = 4
        """Number of segments. Always 4 due to rectangular image form.
           (For compatibility with `Path.count`.)"""
        self.stroke: int = 0
        """The border stroke color. Always 0.
           (For compatibility with `Path.stroke`.)"""
        self.fill: int = 0
        """The image fill color. Always 0.
           (For compatibility with `Path.fill`.)"""
        self.width: float = 0
        """The border line width. Always 0.
           (For compatibility with `Path.width`.)"""

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        return self.get_matrix()

    @cached_property
    def bbox(self) -> Rectangle:
        """The bounding box of the image, transformed if the page is rotated."""
        bbox = Rectangle(*self.get_pos())
        if self.page.rotation:
            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
        return bbox

    @cached_property
    def points(self) -> list[Point]:
        """
        The 4 points of the bounding box.
        (For compatibility with `Path.points`.)
        """
        points = self.bbox.points
        if self.page.rotation:
            # NOTE(review): `bbox` has already been transformed for rotated
            # pages, so this looks like a second rotation — confirm intent.
            # The point type is passed by keyword, exactly like `Path.points` does.
            points = [Point(p.y, self.page.height - p.x, type=p.type) for p in points]
        return points

    @cached_property
    def lines(self) -> list[Line]:
        """
        The 4 lines of the bounding box.
        (For compatibility with `Path.lines`.)
        """
        p = self.points
        # Width and type are passed by keyword, exactly like `Path.lines` does.
        # Each line takes the type of its end point; the last one closes the box.
        return [
            Line(p[start], p[end], width=self.width, type=p[end].type)
            for start, end in ((0, 1), (1, 2), (2, 3), (3, 0))
        ]

    def __repr__(self) -> str:
        return f"I{self.bbox}"

This class extends pypdfium2.PdfImage to align it with the interface of the Path class so that it can be used in the same algorithms without filtering.

You must construct the images by calling modm_data.pdf.page.Page.images.

Images are currently ignored.
Image(obj)
25    def __init__(self, obj):
26        """
27        :param obj: Page object of the image.
28        """
29        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
30        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE
31        self.type = pp.raw.FPDF_PAGEOBJ_IMAGE
32
33        self.count: int = 4
34        """Number of segments. Always 4 due to rectangular image form.
35           (For compatibility with `Path.count`.)"""
36        self.stroke: int = 0
37        """The border stroke color. Always 0.
38           (For compatibility with `Path.stroke`.)"""
39        self.fill: int = 0
40        """The image fill color. Always 0.
41           (For compatibility with `Path.fill`.)"""
42        self.width: float = 0
43        """The border line width. Always 0.
44           (For compatibility with `Path.width`.)"""
Parameters
  • obj: Page object of the image.
type
count: int

Number of segments. Always 4 due to rectangular image form. (For compatibility with Path.count.)

stroke: int

The border stroke color. Always 0. (For compatibility with Path.stroke.)

fill: int

The image fill color. Always 0. (For compatibility with Path.fill.)

width: float

The border line width. Always 0. (For compatibility with Path.width.)

matrix: pypdfium2._helpers.matrix.PdfMatrix
46    @cached_property
47    def matrix(self) -> pp.PdfMatrix:
48        """The transformation matrix."""
49        return self.get_matrix()

The transformation matrix.

bbox: modm_data.utils.Rectangle
51    @cached_property
52    def bbox(self) -> Rectangle:
53        """The bounding box of the image."""
54        bbox = Rectangle(*self.get_pos())
55        if self.page.rotation:
56            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
57        return bbox

The bounding box of the image.

points: list[modm_data.utils.Point]
59    @cached_property
60    def points(self) -> list[Point]:
61        """
62        The 4 points of the bounding box.
63        (For compatibility with `Path.points`.)
64        """
65        points = self.bbox.points
66        if self.page.rotation:
67            points = [Point(p.y, self.page.height - p.x, p.type) for p in points]
68        return points

The 4 points of the bounding box. (For compatibility with Path.points.)

lines: list[modm_data.utils.Line]
70    @cached_property
71    def lines(self) -> list[Line]:
72        """
73        The 4 lines of the bounding box.
74        (For compatibility with `Path.lines`.)
75        """
76        p = self.points
77        return [
78            Line(p[0], p[1], p[1].type, 0),
79            Line(p[1], p[2], p[2].type, 0),
80            Line(p[2], p[3], p[3].type, 0),
81            Line(p[3], p[0], p[0].type, 0),
82        ]

The 4 lines of the bounding box. (For compatibility with Path.lines.)

Inherited Members
pypdfium2._helpers.pageobjects.PdfImage
SIMPLE_FILTERS
new
get_metadata
get_size
load_jpeg
set_bitmap
get_bitmap
get_data
get_filters
extract
pypdfium2._helpers.pageobjects.PdfObject
parent
get_pos
get_matrix
set_matrix
transform
pypdfium2.internal.bases.AutoCloseable
close
class Structure:
 11class Structure:
 12    """
 13    A tagged PDF/UA (Universal Accessibility) contains the structure of content
 14    as a tree data structure with similar semantics to HTML. Sadly, the quality
 15    of the tags depends heavily on the PDF creation software. See
 16    [Overview of PDF tags](https://accessible-pdf.info/en/basics/general/overview-of-the-pdf-tags/).
 17
 18    An example of an accessible pdf that can be inspected via these classes:
 19    [Rock On, D.C. Music Festival](https://commonlook.com/wp-content/uploads/2020/04/accessible-pdf-example.pdf).
 20
 21    This class is a convenience wrapper around [the pdfium structtree methods](
 22    https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h).
 23    """
 24
 25    def __init__(self, page: "modm_data.pdf.page.Page", element: pp.raw.FPDF_STRUCTELEMENT, parent: "Structure" = None):  # noqa: F821
 26        self._page = page
 27        self._element = element
 28        self.parent: Structure = weakref.ref(parent) if parent else None
 29        """The parent node."""
 30
 31    def _get_string(self, function) -> str:
 32        length = function(self._element, 0, 0)
 33        clength = ctypes.c_ulong(length)
 34        cbuffer = ctypes.create_string_buffer(length)
 35        function(self._element, cbuffer, clength)
 36        return bytes(cbuffer).decode("utf-16-le", errors="ignore")
 37
 38    @cached_property
 39    def title(self) -> str:
 40        """Title `/T`"""
 41        return self._get_string(pp.raw.FPDF_StructElement_GetTitle)
 42
 43    @cached_property
 44    def actual_text(self) -> str:
 45        """The actual text."""
 46        return self._get_string(pp.raw.FPDF_StructElement_GetActualText)
 47
 48    @cached_property
 49    def alt_text(self) -> str:
 50        """Alternate Text"""
 51        return self._get_string(pp.raw.FPDF_StructElement_GetAltText)
 52
 53    @cached_property
 54    def type(self) -> str:
 55        """Type `/S`"""
 56        return self._get_string(pp.raw.FPDF_StructElement_GetType)
 57
 58    @cached_property
 59    def obj_type(self) -> str:
 60        """Object Type `/Type`"""
 61        return self._get_string(pp.raw.FPDF_StructElement_GetObjType)
 62
 63    @cached_property
 64    def language(self) -> str:
 65        """The case-insensitive IETF BCP 47 language code."""
 66        return self._get_string(pp.raw.FPDF_StructElement_GetLang)
 67
 68    @cached_property
 69    def id(self) -> str:
 70        """Identifier"""
 71        return self._get_string(pp.raw.FPDF_StructElement_GetID)
 72
 73    @cached_property
 74    def marked_ids(self) -> list[int]:
 75        """List of marked content identifiers"""
 76        ids = []
 77        for index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)):
 78            if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1:
 79                ids.append(mcid)
 80        return ids
 81
 82    @cached_property
 83    def attributes(self) -> dict[str, str | bool | float]:
 84        """
 85        All attributes of this structure element as a dictionary.
 86
 87        .. note::
 88            Due to limitations of the pdfium API, attribute arrays cannot be
 89            extracted! The values are marked as `[?]` in the dictionary.
 90        """
 91        kv = {}
 92        for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)):
 93            attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex)
 94            for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)):
 95                # Get the name
 96                clength = ctypes.c_ulong(0)
 97                cname = ctypes.create_string_buffer(1)  # workaround to get length
 98                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength)
 99                cname = ctypes.create_string_buffer(clength.value)
100                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength)
101                name = cname.raw.decode("utf-8", errors="ignore")
102
103                # Get the type
104                atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname)
105                assert atype != pp.raw.FPDF_OBJECT_UNKNOWN
106
107                # Then get each type individually
108                match atype:
109                    case pp.raw.FPDF_OBJECT_BOOLEAN:
110                        cbool = ctypes.bool()
111                        assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool)
112                        kv[name] = cbool.value
113
114                    case pp.raw.FPDF_OBJECT_NUMBER:
115                        cfloat = ctypes.c_float()
116                        assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat)
117                        kv[name] = cfloat.value
118
119                    case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME:
120                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength)
121                        cattrname = ctypes.create_string_buffer(clength.value * 2)
122                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength)
123                        kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[: clength.value - 1]
124
125                    # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed?
126                    # case pp.raw.FPDF_OBJECT_ARRAY:
127                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength)
128                    #     cblob = ctypes.create_string_buffer(clength.value)
129                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength)
130                    #     kv[name] = cblob.raw
131
132                    case pp.raw.FPDF_OBJECT_ARRAY:
133                        kv[name] = "[?]"
134
135                    case _:
136                        kv[name] = f"[unknown={atype}?]"
137        return kv
138
139    @cache
140    def child(self, index: int) -> "Structure":
141        """
142        :param index: 0-index of child.
143        :return: Child structure.
144        """
145        index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
146        return Structure(self._page, index, self)
147
148    @property
149    def children(self) -> list:
150        """All child structures."""
151        count = pp.raw.FPDF_StructElement_CountChildren(self._element)
152        for ii in range(count):
153            yield self.child(ii)
154
155    def descr(self, indent=0) -> str:
156        """Description including all children via indentation."""
157        string = " " * indent + repr(self) + "\n"
158        for child in self.children:
159            string += child.descr(indent + 4)
160        return string
161
162    def __repr__(self) -> str:
163        values = []
164        if self.type:
165            values.append(f"type={self.type}")
166        if self.title:
167            values.append(f"title={self.title}")
168        if self.actual_text:
169            values.append(f"act_text={self.actual_text}")
170        if self.alt_text:
171            values.append(f"alt_text={self.alt_text}")
172        if self.id:
173            values.append(f"id={self.id}")
174        values += [f"mid={i}" for i in self.marked_ids]
175        values += [f"{k}={v}" for k, v in self.attributes.items()]
176        return f"S({','.join(map(str, values))})"

A tagged PDF/UA (Universal Accessibility) contains the structure of content as a tree data structure with similar semantics to HTML. Sadly, the quality of the tags depends heavily on the PDF creation software. See Overview of PDF tags.

An example of an accessible pdf that can be inspected via these classes: Rock On, D.C. Music Festival.

This class is a convenience wrapper around the pdfium structtree methods.

Structure( page: Page, element: pypdfium2_raw.bindings.LP_struct_fpdf_structelement_t__, parent: Structure = None)
25    def __init__(self, page: "modm_data.pdf.page.Page", element: pp.raw.FPDF_STRUCTELEMENT, parent: "Structure" = None):  # noqa: F821
26        self._page = page
27        self._element = element
28        self.parent: Structure = weakref.ref(parent) if parent else None
29        """The parent node."""
parent: Structure

The parent node.

title: str
38    @cached_property
39    def title(self) -> str:
40        """Title `/T`"""
41        return self._get_string(pp.raw.FPDF_StructElement_GetTitle)

Title /T

actual_text: str
43    @cached_property
44    def actual_text(self) -> str:
45        """The actual text."""
46        return self._get_string(pp.raw.FPDF_StructElement_GetActualText)

The actual text.

alt_text: str
48    @cached_property
49    def alt_text(self) -> str:
50        """Alternate Text"""
51        return self._get_string(pp.raw.FPDF_StructElement_GetAltText)

Alternate Text

type: str
53    @cached_property
54    def type(self) -> str:
55        """Type `/S`"""
56        return self._get_string(pp.raw.FPDF_StructElement_GetType)

Type /S

obj_type: str
58    @cached_property
59    def obj_type(self) -> str:
60        """Object Type `/Type`"""
61        return self._get_string(pp.raw.FPDF_StructElement_GetObjType)

Object Type /Type

language: str
63    @cached_property
64    def language(self) -> str:
65        """The case-insensitive IETF BCP 47 language code."""
66        return self._get_string(pp.raw.FPDF_StructElement_GetLang)

The case-insensitive IETF BCP 47 language code.

id: str
68    @cached_property
69    def id(self) -> str:
70        """Identifier"""
71        return self._get_string(pp.raw.FPDF_StructElement_GetID)

Identifier

marked_ids: list[int]
73    @cached_property
74    def marked_ids(self) -> list[int]:
75        """List of marked content identifiers"""
76        ids = []
77        for index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)):
78            if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1:
79                ids.append(mcid)
80        return ids

List of marked content identifiers

attributes: dict[str, str | bool | float]
 82    @cached_property
 83    def attributes(self) -> dict[str, str | bool | float]:
 84        """
 85        All attributes of this structure element as a dictionary.
 86
 87        .. note::
 88            Due to limitations of the pdfium API, attribute arrays cannot be
 89            extracted! The values are marked as `[?]` in the dictionary.
 90        """
 91        kv = {}
 92        for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)):
 93            attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex)
 94            for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)):
 95                # Get the name
 96                clength = ctypes.c_ulong(0)
 97                cname = ctypes.create_string_buffer(1)  # workaround to get length
 98                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength)
 99                cname = ctypes.create_string_buffer(clength.value)
100                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength)
101                name = cname.raw.decode("utf-8", errors="ignore")
102
103                # Get the type
104                atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname)
105                assert atype != pp.raw.FPDF_OBJECT_UNKNOWN
106
107                # Then get each type individually
108                match atype:
109                    case pp.raw.FPDF_OBJECT_BOOLEAN:
110                        cbool = ctypes.bool()
111                        assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool)
112                        kv[name] = cbool.value
113
114                    case pp.raw.FPDF_OBJECT_NUMBER:
115                        cfloat = ctypes.c_float()
116                        assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat)
117                        kv[name] = cfloat.value
118
119                    case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME:
120                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength)
121                        cattrname = ctypes.create_string_buffer(clength.value * 2)
122                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength)
123                        kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[: clength.value - 1]
124
125                    # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed?
126                    # case pp.raw.FPDF_OBJECT_ARRAY:
127                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength)
128                    #     cblob = ctypes.create_string_buffer(clength.value)
129                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength)
130                    #     kv[name] = cblob.raw
131
132                    case pp.raw.FPDF_OBJECT_ARRAY:
133                        kv[name] = "[?]"
134
135                    case _:
136                        kv[name] = f"[unknown={atype}?]"
137        return kv

All attributes of this structure element as a dictionary.

Due to limitations of the pdfium API, attribute arrays cannot be extracted! The values are marked as [?] in the dictionary.

@cache
def child(self, index: int) -> Structure:
139    @cache
140    def child(self, index: int) -> "Structure":
141        """
142        :param index: 0-index of child.
143        :return: Child structure.
144        """
145        index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
146        return Structure(self._page, index, self)
Parameters
  • index: 0-index of child.
Returns

Child structure.

children: list
148    @property
149    def children(self) -> list:
150        """All child structures."""
151        count = pp.raw.FPDF_StructElement_CountChildren(self._element)
152        for ii in range(count):
153            yield self.child(ii)

All child structures.

def descr(self, indent=0) -> str:
155    def descr(self, indent=0) -> str:
156        """Description including all children via indentation."""
157        string = " " * indent + repr(self) + "\n"
158        for child in self.children:
159            string += child.descr(indent + 4)
160        return string

Description including all children via indentation.