modm_data.pdf.page

PDF Pages

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4"""
  5# PDF Pages
  6
  7
  8"""
  9
 10import ctypes
 11import logging
 12import weakref
 13from typing import Iterator, Callable
 14from bisect import bisect_left, bisect_right
 15from functools import cached_property, cache
 16from collections import defaultdict, OrderedDict
 17import pypdfium2 as pp
 18
 19from ..utils import Rectangle, Region
 20from .character import Character
 21from .link import ObjLink, WebLink
 22from .graphics import Path, Image
 23from .structure import Structure
 24
 25LOGGER = logging.getLogger(__name__)
 26
 27
 28class Page(pp.PdfPage):
 29    """
 30    This class provides low-level access to graphics and characters of the page.
 31    It also fixes missing bounding boxes for rotates characters on page load,
 32    as well as allow searching for characters in an area instead of just text.
 33    """
 34    def __init__(self, document: "modm_data.pdf.Document", index: int):
 35        """
 36        :param document: a PDF document.
 37        :param index: 0-index page number.
 38        """
 39        self.index = index
 40        """0-index page number."""
 41        self.number = index + 1
 42        """1-index page number."""
 43
 44        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
 45        self._links = None
 46        self._weblinks = None
 47        self._linked = False
 48
 49        LOGGER.debug(f"Loading: {index}")
 50
 51        self._text = self.get_textpage()
 52        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
 53        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
 54        # close them in reverse order
 55        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
 56        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)
 57
 58        self._fix_bboxes()
 59
 60    @cached_property
 61    def label(self) -> str:
 62        """The page label."""
 63        return self.pdf.get_page_label(self.index)
 64
 65    @cached_property
 66    def width(self) -> float:
 67        """The page width."""
 68        return self.get_width()
 69
 70    @cached_property
 71    def height(self) -> float:
 72        """The page height."""
 73        return self.get_height()
 74
 75    @cached_property
 76    def rotation(self) -> int:
 77        """The page rotation in degrees."""
 78        return self.get_rotation()
 79
 80    @cached_property
 81    def bbox(self) -> Rectangle:
 82        """The page bounding box."""
 83        return Rectangle(*self.get_bbox())
 84
 85    @cached_property
 86    def char_count(self) -> int:
 87        """The total count of characters."""
 88        return self._text.count_chars()
 89
 90    @cache
 91    def char(self, index: int) -> Character:
 92        """:return: The character at the 0-index."""
 93        return Character(self, index)
 94
 95    @property
 96    def chars(self) -> Iterator[Character]:
 97        """Yields all characters."""
 98        for ii in range(self.char_count):
 99            yield self.char(ii)
100
101    @cached_property
102    def objlinks(self) -> list[ObjLink]:
103        """All object links."""
104        links = []
105        pos = ctypes.c_int(0)
106        link = pp.raw.FPDF_LINK()
107        while pp.raw.FPDFLink_Enumerate(self, pos, link):
108            links.append(ObjLink(self, link))
109        return links
110
111    @cached_property
112    def weblinks(self) -> list[WebLink]:
113        """All web links."""
114        links = []
115        for ii in range(pp.raw.FPDFLink_CountWebLinks(self._linkpage)):
116            links.append(WebLink(self, ii))
117        return links
118
119    def chars_in_area(self, area: Rectangle) -> list[Character]:
120        """
121        :param area: area to search for character in.
122        :return: All characters found in the area.
123        """
124        found = []
125        # We perform binary searches of the lower and upper y-positions first
126        # lines are ordered by y-position
127        ypositions = list(self._charlines.keys())
128        y_bottom = bisect_left(ypositions, area.bottom)
129        y_top = bisect_right(ypositions, area.top, lo=y_bottom)
130
131        # Then for every line we do another binary search for left and right
132        for ypos in ypositions[y_bottom:y_top]:
133            chars = self._charlines[ypos]
134            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
135            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
136            # Finally we add all these characters
137            found.extend(chars[x_left:x_right])
138        return found
139
140    def text_in_area(self, area: Rectangle) -> str:
141        """
142        :param area: area to search for text in.
143        :return: Only the text found in the area.
144        """
145        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)
146
147    @property
148    def structures(self) -> Iterator[Structure]:
149        """The PDF/UA tags."""
150        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
151        for ii in range(count):
152            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
153            yield Structure(self, child)
154
155    def find(self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
156        """
157        Searches for a match string as whole, consecutive words and yields the
158        characters.
159
160        :param string: The search string.
161        :param case_sensitive: Ignore case if false.
162        :return: yields the characters found.
163        """
164        searcher = self._text.search(string, match_case=case_sensitive,
165                                      match_whole_word=True, consecutive=True)
166        while idx := searcher.get_next():
167            chars = [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]
168            yield chars
169
170    @cached_property
171    def paths(self) -> list[Path]:
172        """All paths."""
173        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]
174
175    @cached_property
176    def images(self) -> list[Image]:
177        """All images."""
178        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]
179
180    def graphic_clusters(self, predicate: Callable[[Path|Image], bool] = None,
181                         absolute_tolerance: float = None) -> \
182                                            list[tuple[Rectangle, list[Path]]]:
183        if absolute_tolerance is None:
184            absolute_tolerance = min(self.width, self.height) * 0.01
185
186        # First collect all vertical regions
187        filtered_paths = []
188        for path in self.paths:
189            if predicate is None or predicate(path):
190                filtered_paths.append(path)
191        for image in self.images:
192            if predicate is None or predicate(image):
193                filtered_paths.append(image)
194
195        regions = []
196        for path in sorted(filtered_paths, key=lambda l: l.bbox.y):
197            for reg in regions:
198                if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
199                    # They overlap, so merge them
200                    reg.v0 = min(reg.v0, path.bbox.bottom)
201                    reg.v1 = max(reg.v1, path.bbox.top)
202                    reg.objs.append(path)
203                    break
204            else:
205                regions.append(Region(path.bbox.bottom, path.bbox.top, path))
206
207        # Now collect horizontal region inside each vertical region
208        for yreg in regions:
209            for path in sorted(filtered_paths, key=lambda l: l.bbox.x):
210                # check if horizontal line is contained in vregion
211                if yreg.contains(path.bbox.y, absolute_tolerance):
212                    for xreg in yreg.subregions:
213                        if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
214                            # They overlap so merge them
215                            xreg.v0 = min(xreg.v0, path.bbox.left)
216                            xreg.v1 = max(xreg.v1, path.bbox.right)
217                            xreg.objs.append(path)
218                            break
219                    else:
220                        yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))
221
222        clusters = []
223        for yreg in regions:
224            for xreg in yreg.subregions:
225                if len(yreg.subregions) > 1:
226                    # Strip down the height again for subregions
227                    y0, y1 = 1e9, 0
228                    for path in xreg.objs:
229                        y0 = min(y0, path.bbox.bottom)
230                        y1 = max(y1, path.bbox.top)
231                else:
232                    y0, y1 = yreg.v0, yreg.v1
233                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
234                clusters.append((bbox, xreg.objs))
235
236        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
237
238
239    def _link_characters(self):
240        if self._linked:
241            return
242        # The in-document links only gives us rectangles and we must find the
243        # linked chars ourselves
244        for link in self.objlinks:
245            for char in self.chars_in_area(link.bbox):
246                char.objlink = link
247        # The weblinks give you an explicit char range, very convenient
248        for link in self.weblinks:
249            for ii in range(*link.range):
250                self.char(ii).weblink = link
251        self._linked = True
252
253    @cached_property
254    def _charlines(self):
255        charlines = defaultdict(list)
256        for char in self.chars:
257            charlines[round(char.bbox.midpoint.y, 1)].append(char)
258
259        orderedchars = OrderedDict.fromkeys(sorted(charlines))
260        for ypos, chars in charlines.items():
261            orderedchars[ypos] = sorted(chars, key=lambda c: c.bbox.midpoint.x)
262
263        return orderedchars
264
    def _fix_bboxes(self):
        """
        Replaces empty character bounding boxes with a box taken from a
        reference character that has the same font, unicode and rounded
        tight bounding box size.

        Reference boxes are stored in ``self.pdf._bbox_cache``, which lives on
        the document object rather than on this page.
        """
        def _key(char):
            # Cache key: font, unicode and the tight bounding box size rounded
            # to one decimal.
            height = round(char.tbbox.height, 1)
            width = round(char.tbbox.width, 1)
            return f"{char.font} {char.unicode} {height} {width}"
        fix_chars = []
        for char in self.chars:
            if not char._bbox.width or not char._bbox.height:
                # Empty box: rotated characters are always fixed, unrotated
                # ones only if they are not a line feed or carriage return.
                if char._rotation:
                    fix_chars.append(char)
                elif char.unicode not in {0xa, 0xd}:
                    fix_chars.append(char)
            elif (char.unicode not in {0xa, 0xd} and not char._rotation and
                  _key(char) not in self.pdf._bbox_cache):
                # Valid box of an unrotated character: store it relative to
                # the character origin as reference for this key.
                bbox = char._bbox.translated(-char.origin).rotated(self.rotation + char._rotation)
                self.pdf._bbox_cache[_key(char)] = (char, bbox)
                # print("->", _key(char), char.descr(), char.height, char.rotation, char._rotation, self.rotation)
        for char in fix_chars:
            bbox = self.pdf._bbox_cache.get(_key(char))
            if bbox is not None:
                # print("<-", char.descr(), char._rotation, char.rotation, char.height)
                # Apply the inverse rotation and move the reference box to the
                # origin of this character.
                _, bbox = bbox
                bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
                char._bbox = bbox
            elif char.unicode not in {0x20, 0xa, 0xd}:
                # No reference found; spaces and line breaks are not reported.
                LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")
LOGGER = <Logger modm_data.pdf.page (WARNING)>
class Page(pypdfium2._helpers.page.PdfPage):
 29class Page(pp.PdfPage):
 30    """
 31    This class provides low-level access to graphics and characters of the page.
 32    It also fixes missing bounding boxes for rotates characters on page load,
 33    as well as allow searching for characters in an area instead of just text.
 34    """
 35    def __init__(self, document: "modm_data.pdf.Document", index: int):
 36        """
 37        :param document: a PDF document.
 38        :param index: 0-index page number.
 39        """
 40        self.index = index
 41        """0-index page number."""
 42        self.number = index + 1
 43        """1-index page number."""
 44
 45        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
 46        self._links = None
 47        self._weblinks = None
 48        self._linked = False
 49
 50        LOGGER.debug(f"Loading: {index}")
 51
 52        self._text = self.get_textpage()
 53        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
 54        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
 55        # close them in reverse order
 56        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
 57        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)
 58
 59        self._fix_bboxes()
 60
 61    @cached_property
 62    def label(self) -> str:
 63        """The page label."""
 64        return self.pdf.get_page_label(self.index)
 65
 66    @cached_property
 67    def width(self) -> float:
 68        """The page width."""
 69        return self.get_width()
 70
 71    @cached_property
 72    def height(self) -> float:
 73        """The page height."""
 74        return self.get_height()
 75
 76    @cached_property
 77    def rotation(self) -> int:
 78        """The page rotation in degrees."""
 79        return self.get_rotation()
 80
 81    @cached_property
 82    def bbox(self) -> Rectangle:
 83        """The page bounding box."""
 84        return Rectangle(*self.get_bbox())
 85
 86    @cached_property
 87    def char_count(self) -> int:
 88        """The total count of characters."""
 89        return self._text.count_chars()
 90
 91    @cache
 92    def char(self, index: int) -> Character:
 93        """:return: The character at the 0-index."""
 94        return Character(self, index)
 95
 96    @property
 97    def chars(self) -> Iterator[Character]:
 98        """Yields all characters."""
 99        for ii in range(self.char_count):
100            yield self.char(ii)
101
102    @cached_property
103    def objlinks(self) -> list[ObjLink]:
104        """All object links."""
105        links = []
106        pos = ctypes.c_int(0)
107        link = pp.raw.FPDF_LINK()
108        while pp.raw.FPDFLink_Enumerate(self, pos, link):
109            links.append(ObjLink(self, link))
110        return links
111
112    @cached_property
113    def weblinks(self) -> list[WebLink]:
114        """All web links."""
115        links = []
116        for ii in range(pp.raw.FPDFLink_CountWebLinks(self._linkpage)):
117            links.append(WebLink(self, ii))
118        return links
119
120    def chars_in_area(self, area: Rectangle) -> list[Character]:
121        """
122        :param area: area to search for character in.
123        :return: All characters found in the area.
124        """
125        found = []
126        # We perform binary searches of the lower and upper y-positions first
127        # lines are ordered by y-position
128        ypositions = list(self._charlines.keys())
129        y_bottom = bisect_left(ypositions, area.bottom)
130        y_top = bisect_right(ypositions, area.top, lo=y_bottom)
131
132        # Then for every line we do another binary search for left and right
133        for ypos in ypositions[y_bottom:y_top]:
134            chars = self._charlines[ypos]
135            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
136            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
137            # Finally we add all these characters
138            found.extend(chars[x_left:x_right])
139        return found
140
141    def text_in_area(self, area: Rectangle) -> str:
142        """
143        :param area: area to search for text in.
144        :return: Only the text found in the area.
145        """
146        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)
147
148    @property
149    def structures(self) -> Iterator[Structure]:
150        """The PDF/UA tags."""
151        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
152        for ii in range(count):
153            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
154            yield Structure(self, child)
155
156    def find(self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
157        """
158        Searches for a match string as whole, consecutive words and yields the
159        characters.
160
161        :param string: The search string.
162        :param case_sensitive: Ignore case if false.
163        :return: yields the characters found.
164        """
165        searcher = self._text.search(string, match_case=case_sensitive,
166                                      match_whole_word=True, consecutive=True)
167        while idx := searcher.get_next():
168            chars = [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]
169            yield chars
170
171    @cached_property
172    def paths(self) -> list[Path]:
173        """All paths."""
174        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]
175
176    @cached_property
177    def images(self) -> list[Image]:
178        """All images."""
179        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]
180
181    def graphic_clusters(self, predicate: Callable[[Path|Image], bool] = None,
182                         absolute_tolerance: float = None) -> \
183                                            list[tuple[Rectangle, list[Path]]]:
184        if absolute_tolerance is None:
185            absolute_tolerance = min(self.width, self.height) * 0.01
186
187        # First collect all vertical regions
188        filtered_paths = []
189        for path in self.paths:
190            if predicate is None or predicate(path):
191                filtered_paths.append(path)
192        for image in self.images:
193            if predicate is None or predicate(image):
194                filtered_paths.append(image)
195
196        regions = []
197        for path in sorted(filtered_paths, key=lambda l: l.bbox.y):
198            for reg in regions:
199                if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
200                    # They overlap, so merge them
201                    reg.v0 = min(reg.v0, path.bbox.bottom)
202                    reg.v1 = max(reg.v1, path.bbox.top)
203                    reg.objs.append(path)
204                    break
205            else:
206                regions.append(Region(path.bbox.bottom, path.bbox.top, path))
207
208        # Now collect horizontal region inside each vertical region
209        for yreg in regions:
210            for path in sorted(filtered_paths, key=lambda l: l.bbox.x):
211                # check if horizontal line is contained in vregion
212                if yreg.contains(path.bbox.y, absolute_tolerance):
213                    for xreg in yreg.subregions:
214                        if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
215                            # They overlap so merge them
216                            xreg.v0 = min(xreg.v0, path.bbox.left)
217                            xreg.v1 = max(xreg.v1, path.bbox.right)
218                            xreg.objs.append(path)
219                            break
220                    else:
221                        yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))
222
223        clusters = []
224        for yreg in regions:
225            for xreg in yreg.subregions:
226                if len(yreg.subregions) > 1:
227                    # Strip down the height again for subregions
228                    y0, y1 = 1e9, 0
229                    for path in xreg.objs:
230                        y0 = min(y0, path.bbox.bottom)
231                        y1 = max(y1, path.bbox.top)
232                else:
233                    y0, y1 = yreg.v0, yreg.v1
234                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
235                clusters.append((bbox, xreg.objs))
236
237        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
238
239
240    def _link_characters(self):
241        if self._linked:
242            return
243        # The in-document links only gives us rectangles and we must find the
244        # linked chars ourselves
245        for link in self.objlinks:
246            for char in self.chars_in_area(link.bbox):
247                char.objlink = link
248        # The weblinks give you an explicit char range, very convenient
249        for link in self.weblinks:
250            for ii in range(*link.range):
251                self.char(ii).weblink = link
252        self._linked = True
253
254    @cached_property
255    def _charlines(self):
256        charlines = defaultdict(list)
257        for char in self.chars:
258            charlines[round(char.bbox.midpoint.y, 1)].append(char)
259
260        orderedchars = OrderedDict.fromkeys(sorted(charlines))
261        for ypos, chars in charlines.items():
262            orderedchars[ypos] = sorted(chars, key=lambda c: c.bbox.midpoint.x)
263
264        return orderedchars
265
266    def _fix_bboxes(self):
267        def _key(char):
268            height = round(char.tbbox.height, 1)
269            width = round(char.tbbox.width, 1)
270            return f"{char.font} {char.unicode} {height} {width}"
271        fix_chars = []
272        for char in self.chars:
273            if not char._bbox.width or not char._bbox.height:
274                if char._rotation:
275                    fix_chars.append(char)
276                elif char.unicode not in {0xa, 0xd}:
277                    fix_chars.append(char)
278            elif (char.unicode not in {0xa, 0xd} and not char._rotation and
279                  _key(char) not in self.pdf._bbox_cache):
280                bbox = char._bbox.translated(-char.origin).rotated(self.rotation + char._rotation)
281                self.pdf._bbox_cache[_key(char)] = (char, bbox)
282                # print("->", _key(char), char.descr(), char.height, char.rotation, char._rotation, self.rotation)
283        for char in fix_chars:
284            bbox = self.pdf._bbox_cache.get(_key(char))
285            if bbox is not None:
286                # print("<-", char.descr(), char._rotation, char.rotation, char.height)
287                _, bbox = bbox
288                bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
289                char._bbox = bbox
290            elif char.unicode not in {0x20, 0xa, 0xd}:
291                LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")

This class provides low-level access to the graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.

Page(document: modm_data.pdf.document.Document, index: int)
35    def __init__(self, document: "modm_data.pdf.Document", index: int):
36        """
37        :param document: a PDF document.
38        :param index: 0-index page number.
39        """
40        self.index = index
41        """0-index page number."""
42        self.number = index + 1
43        """1-index page number."""
44
45        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
46        self._links = None
47        self._weblinks = None
48        self._linked = False
49
50        LOGGER.debug(f"Loading: {index}")
51
52        self._text = self.get_textpage()
53        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
54        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
55        # close them in reverse order
56        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
57        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)
58
59        self._fix_bboxes()
Parameters
  • document: a PDF document.
  • index: 0-index page number.
index

0-index page number.

number

1-index page number.

label: str
61    @cached_property
62    def label(self) -> str:
63        """The page label."""
64        return self.pdf.get_page_label(self.index)

The page label.

width: float
66    @cached_property
67    def width(self) -> float:
68        """The page width."""
69        return self.get_width()

The page width.

height: float
71    @cached_property
72    def height(self) -> float:
73        """The page height."""
74        return self.get_height()

The page height.

rotation: int
76    @cached_property
77    def rotation(self) -> int:
78        """The page rotation in degrees."""
79        return self.get_rotation()

The page rotation in degrees.

bbox: modm_data.utils.math.Rectangle
81    @cached_property
82    def bbox(self) -> Rectangle:
83        """The page bounding box."""
84        return Rectangle(*self.get_bbox())

The page bounding box.

char_count: int
86    @cached_property
87    def char_count(self) -> int:
88        """The total count of characters."""
89        return self._text.count_chars()

The total count of characters.

@cache
def char(self, index: int) -> modm_data.pdf.character.Character:
91    @cache
92    def char(self, index: int) -> Character:
93        """:return: The character at the 0-index."""
94        return Character(self, index)
Returns

The character at the 0-index.

chars: Iterator[modm_data.pdf.character.Character]
 96    @property
 97    def chars(self) -> Iterator[Character]:
 98        """Yields all characters."""
 99        for ii in range(self.char_count):
100            yield self.char(ii)

Yields all characters.

def chars_in_area( self, area: modm_data.utils.math.Rectangle) -> list[modm_data.pdf.character.Character]:
120    def chars_in_area(self, area: Rectangle) -> list[Character]:
121        """
122        :param area: area to search for character in.
123        :return: All characters found in the area.
124        """
125        found = []
126        # We perform binary searches of the lower and upper y-positions first
127        # lines are ordered by y-position
128        ypositions = list(self._charlines.keys())
129        y_bottom = bisect_left(ypositions, area.bottom)
130        y_top = bisect_right(ypositions, area.top, lo=y_bottom)
131
132        # Then for every line we do another binary search for left and right
133        for ypos in ypositions[y_bottom:y_top]:
134            chars = self._charlines[ypos]
135            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
136            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
137            # Finally we add all these characters
138            found.extend(chars[x_left:x_right])
139        return found
Parameters
  • area: area to search for character in.
Returns

All characters found in the area.

def text_in_area(self, area: modm_data.utils.math.Rectangle) -> str:
141    def text_in_area(self, area: Rectangle) -> str:
142        """
143        :param area: area to search for text in.
144        :return: Only the text found in the area.
145        """
146        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)
Parameters
  • area: area to search for text in.
Returns

Only the text found in the area.

structures: Iterator[modm_data.pdf.structure.Structure]
148    @property
149    def structures(self) -> Iterator[Structure]:
150        """The PDF/UA tags."""
151        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
152        for ii in range(count):
153            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
154            yield Structure(self, child)

The PDF/UA tags.

def find( self, string: str, case_sensitive: bool = True) -> Iterator[list[modm_data.pdf.character.Character]]:
156    def find(self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
157        """
158        Searches for a match string as whole, consecutive words and yields the
159        characters.
160
161        :param string: The search string.
162        :param case_sensitive: Ignore case if false.
163        :return: yields the characters found.
164        """
165        searcher = self._text.search(string, match_case=case_sensitive,
166                                      match_whole_word=True, consecutive=True)
167        while idx := searcher.get_next():
168            chars = [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]
169            yield chars

Searches for a match string as whole, consecutive words and yields the characters.

Parameters
  • string: The search string.
  • case_sensitive: Ignore case if false.
Returns

yields the characters found.

paths: list[modm_data.pdf.graphics.Path]
171    @cached_property
172    def paths(self) -> list[Path]:
173        """All paths."""
174        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]

All paths.

images: list[modm_data.pdf.graphics.Image]
176    @cached_property
177    def images(self) -> list[Image]:
178        """All images."""
179        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]

All images.

def graphic_clusters( self, predicate: Callable[[modm_data.pdf.graphics.Path | modm_data.pdf.graphics.Image], bool] = None, absolute_tolerance: float = None) -> list[tuple[modm_data.utils.math.Rectangle, list[modm_data.pdf.graphics.Path]]]:
181    def graphic_clusters(self, predicate: Callable[[Path|Image], bool] = None,
182                         absolute_tolerance: float = None) -> \
183                                            list[tuple[Rectangle, list[Path]]]:
184        if absolute_tolerance is None:
185            absolute_tolerance = min(self.width, self.height) * 0.01
186
187        # First collect all vertical regions
188        filtered_paths = []
189        for path in self.paths:
190            if predicate is None or predicate(path):
191                filtered_paths.append(path)
192        for image in self.images:
193            if predicate is None or predicate(image):
194                filtered_paths.append(image)
195
196        regions = []
197        for path in sorted(filtered_paths, key=lambda l: l.bbox.y):
198            for reg in regions:
199                if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
200                    # They overlap, so merge them
201                    reg.v0 = min(reg.v0, path.bbox.bottom)
202                    reg.v1 = max(reg.v1, path.bbox.top)
203                    reg.objs.append(path)
204                    break
205            else:
206                regions.append(Region(path.bbox.bottom, path.bbox.top, path))
207
208        # Now collect horizontal region inside each vertical region
209        for yreg in regions:
210            for path in sorted(filtered_paths, key=lambda l: l.bbox.x):
211                # check if horizontal line is contained in vregion
212                if yreg.contains(path.bbox.y, absolute_tolerance):
213                    for xreg in yreg.subregions:
214                        if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
215                            # They overlap so merge them
216                            xreg.v0 = min(xreg.v0, path.bbox.left)
217                            xreg.v1 = max(xreg.v1, path.bbox.right)
218                            xreg.objs.append(path)
219                            break
220                    else:
221                        yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))
222
223        clusters = []
224        for yreg in regions:
225            for xreg in yreg.subregions:
226                if len(yreg.subregions) > 1:
227                    # Strip down the height again for subregions
228                    y0, y1 = 1e9, 0
229                    for path in xreg.objs:
230                        y0 = min(y0, path.bbox.bottom)
231                        y1 = max(y1, path.bbox.top)
232                else:
233                    y0, y1 = yreg.v0, yreg.v1
234                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
235                clusters.append((bbox, xreg.objs))
236
237        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
Inherited Members
pypdfium2._helpers.page.PdfPage
parent
get_width
get_height
get_size
get_rotation
set_rotation
get_mediabox
set_mediabox
get_cropbox
set_cropbox
get_bleedbox
set_bleedbox
get_trimbox
set_trimbox
get_artbox
set_artbox
get_bbox
get_textpage
insert_obj
remove_obj
gen_content
get_objects
render
pypdfium2.internal.bases.AutoCloseable
close