modm_data.pdf.page

PDF Pages

View Source
  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4"""
  5# PDF Pages
  6
  7
  8"""
  9
 10import ctypes
 11import logging
 12import weakref
 13from typing import Iterator, Callable
 14from bisect import bisect_left, bisect_right
 15from functools import cached_property, cache
 16from collections import defaultdict, OrderedDict
 17import pypdfium2 as pp
 18
 19from ..utils import Rectangle, Region
 20from .character import Character
 21from .link import ObjLink, WebLink
 22from .graphics import Path, Image
 23from .structure import Structure
 24
# Module-level logger, named after this module via `__name__`.
LOGGER = logging.getLogger(__name__)
 26
 27
 28class Page(pp.PdfPage):
 29    """
 30    This class provides low-level access to graphics and characters of the page.
 31    It also fixes missing bounding boxes for rotated characters on page load,
 32    and allows searching for characters in an area instead of just text.
 33    """
    def __init__(self, document: "modm_data.pdf.Document", index: int) -> None:
        """
        Loads the page at `index` from `document`, opens its text page, web
        link and structure tree handles and repairs empty character bounding
        boxes.

        :param document: a PDF document.
        :param index: 0-index page number.
        """
        self.index = index
        """0-index page number."""
        self.number = index + 1
        """1-index page number."""

        # Load the raw page handle and hand it to the pypdfium2 base class
        # together with the owning document and its form environment.
        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
        # NOTE(review): `_links` and `_weblinks` are not read by any method
        # visible in this class -- confirm whether they are still needed.
        self._links = None
        self._weblinks = None
        # Set to True once `_link_characters()` has run.
        self._linked = False

        LOGGER.debug(f"Loading: {index}")

        # The web link handle is derived from the text page, the structure
        # tree handle from the page itself.
        self._text = self.get_textpage()
        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
        # close them in reverse order
        # The finalizers release the raw handles once this page object is
        # garbage collected, or at interpreter exit at the latest.
        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)

        # Needs `self._text` (via `self.chars`), so this must come last.
        self._fix_bboxes()
 59
 60    @cached_property
 61    def label(self) -> str:
 62        """The page label."""
 63        return self.pdf.get_page_label(self.index)
 64
 65    @cached_property
 66    def width(self) -> float:
 67        """The page width."""
 68        return self.get_width()
 69
 70    @cached_property
 71    def height(self) -> float:
 72        """The page height."""
 73        return self.get_height()
 74
 75    @cached_property
 76    def rotation(self) -> int:
 77        """The page rotation in degrees."""
 78        return self.get_rotation()
 79
 80    @cached_property
 81    def bbox(self) -> Rectangle:
 82        """The page bounding box."""
 83        return Rectangle(*self.get_bbox())
 84
 85    @cached_property
 86    def char_count(self) -> int:
 87        """The total count of characters."""
 88        return self._text.count_chars()
 89
 90    @cache
 91    def char(self, index: int) -> Character:
 92        """:return: The character at the 0-index."""
 93        return Character(self, index)
 94
 95    @property
 96    def chars(self) -> Iterator[Character]:
 97        """Yields all characters."""
 98        for ii in range(self.char_count):
 99            yield self.char(ii)
100
101    @cached_property
102    def objlinks(self) -> list[ObjLink]:
103        """All object links."""
104        links = []
105        pos = ctypes.c_int(0)
106        link = pp.raw.FPDF_LINK()
107        while pp.raw.FPDFLink_Enumerate(self, pos, link):
108            links.append(ObjLink(self, link))
109        return links
110
111    @cached_property
112    def weblinks(self) -> list[WebLink]:
113        """All web links."""
114        links = []
115        for ii in range(pp.raw.FPDFLink_CountWebLinks(self._linkpage)):
116            links.append(WebLink(self, ii))
117        return links
118
119    def chars_in_area(self, area: Rectangle) -> list[Character]:
120        """
121        :param area: area to search for character in.
122        :return: All characters found in the area.
123        """
124        found = []
125        # We perform binary searches of the lower and upper y-positions first
126        # lines are ordered by y-position
127        ypositions = list(self._charlines.keys())
128        y_bottom = bisect_left(ypositions, area.bottom)
129        y_top = bisect_right(ypositions, area.top, lo=y_bottom)
130
131        # Then for every line we do another binary search for left and right
132        for ypos in ypositions[y_bottom:y_top]:
133            chars = self._charlines[ypos]
134            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
135            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
136            # Finally we add all these characters
137            found.extend(chars[x_left:x_right])
138        return found
139
140    def text_in_area(self, area: Rectangle) -> str:
141        """
142        :param area: area to search for text in.
143        :return: Only the text found in the area.
144        """
145        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)
146
147    @property
148    def structures(self) -> Iterator[Structure]:
149        """The PDF/UA tags."""
150        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
151        for ii in range(count):
152            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
153            yield Structure(self, child)
154
155    def find(self, string: str, case_sensitive: bool = True) -> Iterator[Character]:
156        """
157        Searches for a match string as whole, consecutive words and yields the
158        characters.
159
160        :param string: The search string.
161        :param case_sensitive: Ignore case if false.
162        :return: yields the characters found.
163        """
164        searcher = self._text.search(string, match_case=case_sensitive,
165                                      match_whole_word=True, consecutive=True)
166        while idx := searcher.get_next():
167            chars = [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]
168            yield chars
169
170    @cached_property
171    def paths(self) -> list[Path]:
172        """All paths."""
173        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]
174
175    @cached_property
176    def images(self) -> list[Image]:
177        """All images."""
178        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]
179
180    def graphic_clusters(self, predicate: Callable[[Path|Image], bool] = None,
181                         absolute_tolerance: float = None) -> \
182                                            list[tuple[Rectangle, list[Path]]]:
183        if absolute_tolerance is None:
184            absolute_tolerance = min(self.width, self.height) * 0.01
185
186        # First collect all vertical regions
187        filtered_paths = []
188        for path in self.paths:
189            if predicate is None or predicate(path):
190                filtered_paths.append(path)
191        for image in self.images:
192            if predicate is None or predicate(image):
193                filtered_paths.append(image)
194
195        regions = []
196        for path in sorted(filtered_paths, key=lambda l: l.bbox.y):
197            for reg in regions:
198                if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
199                    # They overlap, so merge them
200                    reg.v0 = min(reg.v0, path.bbox.bottom)
201                    reg.v1 = max(reg.v1, path.bbox.top)
202                    reg.objs.append(path)
203                    break
204            else:
205                regions.append(Region(path.bbox.bottom, path.bbox.top, path))
206
207        # Now collect horizontal region inside each vertical region
208        for yreg in regions:
209            for path in sorted(filtered_paths, key=lambda l: l.bbox.x):
210                # check if horizontal line is contained in vregion
211                if yreg.contains(path.bbox.y, absolute_tolerance):
212                    for xreg in yreg.subregions:
213                        if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
214                            # They overlap so merge them
215                            xreg.v0 = min(xreg.v0, path.bbox.left)
216                            xreg.v1 = max(xreg.v1, path.bbox.right)
217                            xreg.objs.append(path)
218                            break
219                    else:
220                        yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))
221
222        clusters = []
223        for yreg in regions:
224            for xreg in yreg.subregions:
225                if len(yreg.subregions) > 1:
226                    # Strip down the height again for subregions
227                    y0, y1 = 1e9, 0
228                    for path in xreg.objs:
229                        y0 = min(y0, path.bbox.bottom)
230                        y1 = max(y1, path.bbox.top)
231                else:
232                    y0, y1 = yreg.v0, yreg.v1
233                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
234                clusters.append((bbox, xreg.objs))
235
236        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
237
238
239    def _link_characters(self):
240        if self._linked:
241            return
242        # The in-document links only gives us rectangles and we must find the
243        # linked chars ourselves
244        for link in self.objlinks:
245            for char in self.chars_in_area(link.bbox):
246                char.objlink = link
247        # The weblinks give you an explicit char range, very convenient
248        for link in self.weblinks:
249            for ii in range(*link.range):
250                self.char(ii).weblink = link
251        self._linked = True
252
253    @cached_property
254    def _charlines(self):
255        charlines = defaultdict(list)
256        for char in self.chars:
257            charlines[round(char.bbox.midpoint.y, 1)].append(char)
258
259        orderedchars = OrderedDict.fromkeys(sorted(charlines))
260        for ypos, chars in charlines.items():
261            orderedchars[ypos] = sorted(chars, key=lambda c: c.bbox.midpoint.x)
262
263        return orderedchars
264
265    def _fix_bboxes(self):
266        def _key(char):
267            height = round(char.tbbox.height, 1)
268            width = round(char.tbbox.width, 1)
269            return f"{char.font} {char.unicode} {height} {width}"
270        fix_chars = []
271        for char in self.chars:
272            if not char._bbox.width or not char._bbox.height:
273                if char._rotation:
274                    fix_chars.append(char)
275                elif char.unicode not in {0xa, 0xd}:
276                    fix_chars.append(char)
277            elif (char.unicode not in {0xa, 0xd} and not char._rotation and
278                  _key(char) not in self.pdf._bbox_cache):
279                bbox = char._bbox.translated(-char.origin).rotated(self.rotation + char._rotation)
280                self.pdf._bbox_cache[_key(char)] = (char, bbox)
281                # print("->", _key(char), char.descr(), char.height, char.rotation, char._rotation, self.rotation)
282        for char in fix_chars:
283            bbox = self.pdf._bbox_cache.get(_key(char))
284            if bbox is not None:
285                # print("<-", char.descr(), char._rotation, char.rotation, char.height)
286                _, bbox = bbox
287                bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
288                char._bbox = bbox
289            elif char.unicode not in {0x20, 0xa, 0xd}:
290                LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")
LOGGER = <Logger modm_data.pdf.page (WARNING)>
modm_data.pdf.page

PDF Pages

Parameters

Returns

Parameters

Returns

Parameters

Returns

Parameters

Returns

Inherited Members