modm_data.pdf2html.page

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4import logging
  5import statistics
  6from typing import Callable
  7from functools import cached_property
  8from collections import defaultdict
  9from .table import Table
 10from .figure import Figure
 11from .line import CharLine
 12from ..utils import Rectangle, Region
 13from ..pdf import Page as PdfPage, Character
 14from anytree import Node
 15
 16
 17_LOGGER = logging.getLogger(__name__)
 18
 19
 20class Page(PdfPage):
    def __init__(self, document, index: int):
        """
        :param document: a PDF document.
        :param index: 0-index page number.
        """
        super().__init__(document, index)
        # NOTE(review): not read anywhere in this class; presumably consumed by
        # subclasses or callers to select a layout template — confirm.
        self._template = "default"
        self.is_relevant: bool = True
        """Is this page relevant for the conversion?"""
 26
    def _unicode_filter(self, code: int) -> int | None:
        """
        Hook to remap or discard a character code before lines are built.

        `charlines_in_area` calls this for every character: a return value of
        `None` drops the character, any other value replaces `char.unicode`.
        This implementation keeps every code unchanged.

        :param code: the unicode code point of the character.
        :return: the code point to use, or `None` to discard the character.
        """
        return code
 29
 30    @cached_property
 31    def _spacing(self) -> dict[str, float]:
 32        content = 0.1
 33        return {
 34            # Horizontal spacing: left->right
 35            "x_em": 0.01 * self.width,
 36            "x_left": content * self.width,
 37            "x_right": (1 - content) * self.width,
 38            "x_content": 0.2 * self.width,
 39            # Vertical spacing: bottom->top
 40            "y_em": 0.01 * self.height,
 41            # Max table line thickness
 42            "y_tline": 0.005 * self.height,
 43            # Max line height distance to detect paragraphs
 44            "lh": 0.9,
 45            # Max line height distance to detect super-/subscript
 46            "sc": 0.3,
 47            # Table header cell bold text threshold
 48            "th": 0.3,
 49        }
 50
 51    def _line_size(self, line: CharLine) -> str:
 52        rsize = line.height
 53        if rsize >= 17.5:
 54            return "h1"
 55        elif rsize >= 15.5:
 56            return "h2"
 57        elif rsize >= 13.5:
 58            return "h3"
 59        elif rsize >= 11.4:
 60            return "h4"
 61        elif rsize >= 8.5:
 62            return "n"
 63        else:
 64            return "fn"
 65
 66    def _colors(self, color: int) -> str:
 67        if 0xFF <= color <= 0xFF:
 68            return "black"
 69        if 0xFFFFFFFF <= color <= 0xFFFFFFFF:
 70            return "white"
 71        return "unknown"
 72
 73    @cached_property
 74    def _areas(self) -> dict[str, list[Rectangle] | Rectangle]:
 75        content = Rectangle(0.1, 0.1, 0.9, 0.9)
 76        areas = {"content": [content]}
 77        scaled_areas = {}
 78
 79        def _s(r):
 80            return Rectangle(r.left * self.width, r.bottom * self.height, r.right * self.width, r.top * self.height)
 81
 82        for name, area in areas.items():
 83            scaled_areas[name] = [_s(r) for r in area] if isinstance(area, list) else _s(area)
 84        return scaled_areas
 85
 86    def _char_properties(self, line, char):
 87        cp = {
 88            "superscript": False,
 89            "subscript": False,
 90            "bold": any(frag in char.font for frag in {"Bold"}),
 91            "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
 92            "underline": (char.objlink or char.weblink) is not None,
 93            "size": round(line.height),
 94            "relsize": self._line_size(line),
 95            "char": chr(char.unicode),
 96        }
 97        if line.rotation:
 98            if char.origin.x < (line.origin - 0.25 * line.height):
 99                cp["superscript"] = True
100            elif char.origin.x > (line.origin + 0.15 * line.height):
101                cp["subscript"] = True
102        elif char.origin.y > (line.origin + 0.25 * line.height):
103            cp["superscript"] = True
104        elif char.origin.y < (line.origin - 0.15 * line.height):
105            cp["subscript"] = True
106        return cp
107
108    def text_in_named_area(self, name: str, check_length: bool = True) -> str | None:
109        """
110        Find all text in the named area.
111
112        :param name: the name of the area(s) to query.
113        :param check_length: assert that the text has a length.
114        :return: the concatenated text of the named area(s) or `None` if area not found.
115        """
116        if name not in self._areas:
117            return None
118        text = ""
119        areas = self._areas[name]
120        if not isinstance(areas, list):
121            areas = [areas]
122        for area in areas:
123            text += self.text_in_area(area)
124        if check_length:
125            assert text
126        return text
127
128    def charlines_in_area(
129        self, area: Rectangle, predicate: Callable[[Character], bool] = None, rtol: float = None
130    ) -> list[CharLine]:
131        """
132        Coalesce the characters in the area and predicate into lines.
133
134        1. Every character in the area is filtered by the `predicate`.
135        2. Character orientation is split into horizontal (left->right) and
136           vertical (bottom->top) character lines sorted by x or y position.
137           Lines containing only whitespace are discarded.
138        3. Overlapping character lines are merged into sub- and superscript
139           using `rtol * max(current_line.height, next_line.height)` as the
140           tolerance for checking if the lines overlap.
141        4. The characters in the merged lines are re-sorted by origin.
142
143        :param area: Area to search for characters.
144        :param predicate: Function to discard characters in the area or include all by default.
145        :param rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default.
146        :return: A list of character lines sorted by x or y position.
147        """
148        if rtol is None:
149            rtol = self._spacing["sc"]
150        # Split all chars into lines based on rounded origin
151        origin_lines_y = defaultdict(list)
152        origin_lines_x = defaultdict(list)
153        for char in self.chars_in_area(area):
154            # Ignore all characters we don't want
155            if predicate is not None and not predicate(char):
156                continue
157            cunicode = self._unicode_filter(char.unicode)
158            if cunicode is None:
159                continue
160            char.unicode = cunicode
161            if char.unicode < 32 and char.unicode not in {0xA}:
162                continue
163            # Ignore characters without width that are not spaces
164            if not char.width and char.unicode not in {0xA, 0xD, 0x20}:
165                _LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
166            # Split up the chars depending on the orientation
167            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
168                origin_lines_x[round(char.origin.x, 1)].append(char)
169            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
170                origin_lines_y[round(char.origin.y, 1)].append(char)
171            else:
172                _LOGGER.error("Unknown char rotation:", char, char.rotation)
173
174        # Convert characters into lines
175        bbox_lines_y = []
176        for chars in origin_lines_y.values():
177            # Remove lines with whitespace only
178            if all(c.unicode in {0xA, 0xD, 0x20} for c in chars):
179                continue
180            origin = statistics.fmean(c.origin.y for c in chars)
181            line = CharLine(
182                self,
183                chars,
184                min(c.bbox.bottom for c in chars),
185                origin,
186                max(c.bbox.top for c in chars),
187                max(c.height for c in chars),
188                sort_origin=self.height - origin,
189            )
190            bbox_lines_y.append(line)
191            # print(line, line.top, line.origin, line.bottom, line.height)
192        bbox_lines = sorted(bbox_lines_y, key=lambda line: line._sort_origin)
193
194        bbox_lines_x = []
195        for chars in origin_lines_x.values():
196            # Remove lines with whitespace only
197            if all(c.unicode in {0xA, 0xD, 0x20} for c in chars):
198                continue
199            line = CharLine(
200                self,
201                chars,
202                min(c.bbox.left for c in chars),
203                statistics.fmean(c.origin.x for c in chars),
204                max(c.bbox.right for c in chars),
205                max(c.width for c in chars),
206                270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90,
207            )
208            bbox_lines_x.append(line)
209        bbox_lines += sorted(bbox_lines_x, key=lambda line: line._sort_origin)
210
211        if not bbox_lines:
212            return []
213
214        # Merge lines that have overlapping bbox_lines
215        # FIXME: This merges lines that "collide" vertically like in formulas
216        merged_lines = []
217        current_line = bbox_lines[0]
218        for next_line in bbox_lines[1:]:
219            height = max(current_line.height, next_line.height)
220            # Calculate overlap via normalize origin (increasing with line index)
221            if (current_line._sort_origin + rtol * height) > (next_line._sort_origin - rtol * height):
222                # if line.rotation or self.rotation:
223                #     # The next line overlaps this one, we merge the shorter line
224                #     # (typically super- and subscript) into taller line
225                #     use_current = len(current_line.chars) >= len(next_line.chars)
226                # else:
227                use_current = current_line.height >= next_line.height
228                line = current_line if use_current else next_line
229                current_line = CharLine(
230                    self,
231                    current_line.chars + next_line.chars,
232                    line.bottom,
233                    line.origin,
234                    line.top,
235                    height,
236                    line.rotation,
237                    sort_origin=line._sort_origin,
238                )
239            else:
240                # The next line does not overlap the current line
241                merged_lines.append(current_line)
242                current_line = next_line
243        # append last line
244        merged_lines.append(current_line)
245
246        # Sort all lines horizontally based on character origin
247        sorted_lines = []
248        for line in merged_lines:
249            if line.rotation == 90:
250
251                def sort_key(char):
252                    if char.unicode in {0xA, 0xD}:
253                        return char.tbbox.midpoint.y - 1e9
254                    return char.tbbox.midpoint.y
255            elif line.rotation == 270:
256
257                def sort_key(char):
258                    if char.unicode in {0xA, 0xD}:
259                        return -char.tbbox.midpoint.y + 1e9
260                    return -char.tbbox.midpoint.y
261            else:
262
263                def sort_key(char):
264                    if char.unicode in {0xA, 0xD}:
265                        return char.origin.x + 1e9
266                    return char.origin.x
267
268            sorted_lines.append(
269                CharLine(
270                    self,
271                    sorted(line.chars, key=sort_key),
272                    line.bottom,
273                    line.origin,
274                    line.top,
275                    line.height,
276                    line.rotation,
277                    area.left,
278                    sort_origin=line._sort_origin,
279                )
280            )
281
282        return sorted_lines
283
284    def graphic_bboxes_in_area(
285        self, area: Rectangle, with_graphics: bool = True
286    ) -> list[tuple[Rectangle, Table | Figure | None]]:
287        """
288        Coalesce the graphics in the area into full width bounding boxes.
289
290        1. Group vertically overlapping graphics.
291        2. Widen the overlapped graphics bounding boxes to the edges of the area.
292
293        :param area: area to search for content.
294        :param with_graphics: search for graphics in the area.
295        :return: list of tuples (bounding box, graphic objects or `None`).
296        """
297        if with_graphics:
298            graphics = self.graphics_in_area(area)
299            regions = []
300            # Check if graphics bounding boxes overlap vertically and group them
301            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
302                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
303                for reg in regions:
304                    if reg.overlaps(gbbox.bottom, gbbox.top):
305                        # They overlap, so merge them
306                        reg.v0 = min(reg.v0, gbbox.bottom)
307                        reg.v1 = max(reg.v1, gbbox.top)
308                        reg.objs.append(graphic)
309                        break
310                else:
311                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))
312
313            # print(regions)
314            # Coalesce all overlapped graphics objects into full width areas
315            areas = []
316            ypos = area.top
317            for reg in regions:
318                if ypos - reg.v1 > self._spacing["y_em"]:
319                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
320                for obj in reg.objs:
321                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
322                    areas.append((oarea, obj))
323                ypos = reg.v0
324            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
325        else:
326            areas = [(area, None)]
327        return areas
328
329    def objects_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[CharLine | Table | Figure]:
330        """
331        Find all content objects in this area.
332
333        :param area: area to search for content.
334        :param with_graphics: search for graphics in the area.
335        :return: list of content objects sorted top to bottom.
336        """
337        self._link_characters()
338        areas = self.graphic_bboxes_in_area(area, with_graphics)
339        objects = []
340        for narea, obj in areas:
341            if obj is None:
342                objects += self.charlines_in_area(narea)
343            else:
344                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
345
346                def predicate(c):
347                    return not obj.bbox.contains(c.origin)
348
349                lines = self.charlines_in_area(oarea, predicate)
350                # print(obj, oarea, lines, [line.content for line in lines])
351                objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
352        return objects
353
    def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
        """
        Find all tables and figures in this area.

        This implementation does not search for anything and always returns an
        empty list. NOTE(review): presumably subclasses override it with the
        actual table and figure detection — confirm.

        :param area: area to search for graphics.
        :return: list of tables and figures.
        """
        return []
362
    def ast_in_area(self, area: Rectangle, with_graphics: bool = True) -> Node:
        """
        Convert the area content into an abstract syntax tree.

        This implementation only creates a single `area` node that stores the
        area, its left edge truncated to an integer, and this page. The
        `with_graphics` parameter is not used here.

        :param area: area to search for content.
        :param with_graphics: including graphics in the area.
        :return: An abstract syntax tree including the content formatting.
        """
        return Node("area", obj=area, xpos=int(area.left), page=self)
372
373    @property
374    def content_ast(self) -> list[Node]:
375        """The abstract syntax trees in the content area."""
376        ast = []
377        with_graphics = True
378        for area in self._areas["content"]:
379            ast.append(self.ast_in_area(area, with_graphics=with_graphics))
380        # Add a page node to the first leaf to keep track of where a page starts
381        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
382        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
383        return ast
384
385    @property
386    def content_objects(self) -> list[CharLine | Table | Figure]:
387        """All objects in the content areas."""
388        objs = []
389        for area in self._areas["content"]:
390            objs.extend(self.objects_in_area(area))
391        return objs
392
393    @property
394    def content_graphics(self) -> list[Table | Figure]:
395        """All graphics in the content areas."""
396        objs = []
397        for area in self._areas["content"]:
398            objs.extend(self.graphics_in_area(area))
399        return objs
400
401    @property
402    def content_lines(self) -> list[CharLine]:
403        """All lines in the content areas."""
404        objs = []
405        for area in self._areas["content"]:
406            objs.extend(self.charlines_in_area(area))
407        return objs
408
409    @property
410    def content_tables(self) -> list[Table]:
411        """All tables in the content areas."""
412        return [o for o in self.content_graphics if isinstance(o, Table)]
413
414    @property
415    def content_figures(self) -> list[Figure]:
416        """All figures in the content areas."""
417        return [o for o in self.content_graphics if isinstance(o, Figure)]
418
419    def __repr__(self) -> str:
420        return f"Page({self.number})"
class Page(modm_data.pdf.page.Page):
 21class Page(PdfPage):
    def __init__(self, document, index: int):
        """
        :param document: a PDF document.
        :param index: 0-index page number.
        """
        super().__init__(document, index)
        # NOTE(review): not read anywhere in this class; presumably consumed by
        # subclasses or callers to select a layout template — confirm.
        self._template = "default"
        self.is_relevant: bool = True
        """Is this page relevant for the conversion?"""
 27
    def _unicode_filter(self, code: int) -> int | None:
        """
        Hook to remap or discard a character code before lines are built.

        `charlines_in_area` calls this for every character: a return value of
        `None` drops the character, any other value replaces `char.unicode`.
        This implementation keeps every code unchanged.

        :param code: the unicode code point of the character.
        :return: the code point to use, or `None` to discard the character.
        """
        return code
 30
 31    @cached_property
 32    def _spacing(self) -> dict[str, float]:
 33        content = 0.1
 34        return {
 35            # Horizontal spacing: left->right
 36            "x_em": 0.01 * self.width,
 37            "x_left": content * self.width,
 38            "x_right": (1 - content) * self.width,
 39            "x_content": 0.2 * self.width,
 40            # Vertical spacing: bottom->top
 41            "y_em": 0.01 * self.height,
 42            # Max table line thickness
 43            "y_tline": 0.005 * self.height,
 44            # Max line height distance to detect paragraphs
 45            "lh": 0.9,
 46            # Max line height distance to detect super-/subscript
 47            "sc": 0.3,
 48            # Table header cell bold text threshold
 49            "th": 0.3,
 50        }
 51
 52    def _line_size(self, line: CharLine) -> str:
 53        rsize = line.height
 54        if rsize >= 17.5:
 55            return "h1"
 56        elif rsize >= 15.5:
 57            return "h2"
 58        elif rsize >= 13.5:
 59            return "h3"
 60        elif rsize >= 11.4:
 61            return "h4"
 62        elif rsize >= 8.5:
 63            return "n"
 64        else:
 65            return "fn"
 66
 67    def _colors(self, color: int) -> str:
 68        if 0xFF <= color <= 0xFF:
 69            return "black"
 70        if 0xFFFFFFFF <= color <= 0xFFFFFFFF:
 71            return "white"
 72        return "unknown"
 73
 74    @cached_property
 75    def _areas(self) -> dict[str, list[Rectangle] | Rectangle]:
 76        content = Rectangle(0.1, 0.1, 0.9, 0.9)
 77        areas = {"content": [content]}
 78        scaled_areas = {}
 79
 80        def _s(r):
 81            return Rectangle(r.left * self.width, r.bottom * self.height, r.right * self.width, r.top * self.height)
 82
 83        for name, area in areas.items():
 84            scaled_areas[name] = [_s(r) for r in area] if isinstance(area, list) else _s(area)
 85        return scaled_areas
 86
 87    def _char_properties(self, line, char):
 88        cp = {
 89            "superscript": False,
 90            "subscript": False,
 91            "bold": any(frag in char.font for frag in {"Bold"}),
 92            "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
 93            "underline": (char.objlink or char.weblink) is not None,
 94            "size": round(line.height),
 95            "relsize": self._line_size(line),
 96            "char": chr(char.unicode),
 97        }
 98        if line.rotation:
 99            if char.origin.x < (line.origin - 0.25 * line.height):
100                cp["superscript"] = True
101            elif char.origin.x > (line.origin + 0.15 * line.height):
102                cp["subscript"] = True
103        elif char.origin.y > (line.origin + 0.25 * line.height):
104            cp["superscript"] = True
105        elif char.origin.y < (line.origin - 0.15 * line.height):
106            cp["subscript"] = True
107        return cp
108
109    def text_in_named_area(self, name: str, check_length: bool = True) -> str | None:
110        """
111        Find all text in the named area.
112
113        :param name: the name of the area(s) to query.
114        :param check_length: assert that the text has a length.
115        :return: the concatenated text of the named area(s) or `None` if area not found.
116        """
117        if name not in self._areas:
118            return None
119        text = ""
120        areas = self._areas[name]
121        if not isinstance(areas, list):
122            areas = [areas]
123        for area in areas:
124            text += self.text_in_area(area)
125        if check_length:
126            assert text
127        return text
128
129    def charlines_in_area(
130        self, area: Rectangle, predicate: Callable[[Character], bool] = None, rtol: float = None
131    ) -> list[CharLine]:
132        """
133        Coalesce the characters in the area and predicate into lines.
134
135        1. Every character in the area is filtered by the `predicate`.
136        2. Character orientation is split into horizontal (left->right) and
137           vertical (bottom->top) character lines sorted by x or y position.
138           Lines containing only whitespace are discarded.
139        3. Overlapping character lines are merged into sub- and superscript
140           using `rtol * max(current_line.height, next_line.height)` as the
141           tolerance for checking if the lines overlap.
142        4. The characters in the merged lines are re-sorted by origin.
143
144        :param area: Area to search for characters.
145        :param predicate: Function to discard characters in the area or include all by default.
146        :param rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default.
147        :return: A list of character lines sorted by x or y position.
148        """
149        if rtol is None:
150            rtol = self._spacing["sc"]
151        # Split all chars into lines based on rounded origin
152        origin_lines_y = defaultdict(list)
153        origin_lines_x = defaultdict(list)
154        for char in self.chars_in_area(area):
155            # Ignore all characters we don't want
156            if predicate is not None and not predicate(char):
157                continue
158            cunicode = self._unicode_filter(char.unicode)
159            if cunicode is None:
160                continue
161            char.unicode = cunicode
162            if char.unicode < 32 and char.unicode not in {0xA}:
163                continue
164            # Ignore characters without width that are not spaces
165            if not char.width and char.unicode not in {0xA, 0xD, 0x20}:
166                _LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
167            # Split up the chars depending on the orientation
168            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
169                origin_lines_x[round(char.origin.x, 1)].append(char)
170            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
171                origin_lines_y[round(char.origin.y, 1)].append(char)
172            else:
173                _LOGGER.error("Unknown char rotation:", char, char.rotation)
174
175        # Convert characters into lines
176        bbox_lines_y = []
177        for chars in origin_lines_y.values():
178            # Remove lines with whitespace only
179            if all(c.unicode in {0xA, 0xD, 0x20} for c in chars):
180                continue
181            origin = statistics.fmean(c.origin.y for c in chars)
182            line = CharLine(
183                self,
184                chars,
185                min(c.bbox.bottom for c in chars),
186                origin,
187                max(c.bbox.top for c in chars),
188                max(c.height for c in chars),
189                sort_origin=self.height - origin,
190            )
191            bbox_lines_y.append(line)
192            # print(line, line.top, line.origin, line.bottom, line.height)
193        bbox_lines = sorted(bbox_lines_y, key=lambda line: line._sort_origin)
194
195        bbox_lines_x = []
196        for chars in origin_lines_x.values():
197            # Remove lines with whitespace only
198            if all(c.unicode in {0xA, 0xD, 0x20} for c in chars):
199                continue
200            line = CharLine(
201                self,
202                chars,
203                min(c.bbox.left for c in chars),
204                statistics.fmean(c.origin.x for c in chars),
205                max(c.bbox.right for c in chars),
206                max(c.width for c in chars),
207                270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90,
208            )
209            bbox_lines_x.append(line)
210        bbox_lines += sorted(bbox_lines_x, key=lambda line: line._sort_origin)
211
212        if not bbox_lines:
213            return []
214
215        # Merge lines that have overlapping bbox_lines
216        # FIXME: This merges lines that "collide" vertically like in formulas
217        merged_lines = []
218        current_line = bbox_lines[0]
219        for next_line in bbox_lines[1:]:
220            height = max(current_line.height, next_line.height)
221            # Calculate overlap via normalize origin (increasing with line index)
222            if (current_line._sort_origin + rtol * height) > (next_line._sort_origin - rtol * height):
223                # if line.rotation or self.rotation:
224                #     # The next line overlaps this one, we merge the shorter line
225                #     # (typically super- and subscript) into taller line
226                #     use_current = len(current_line.chars) >= len(next_line.chars)
227                # else:
228                use_current = current_line.height >= next_line.height
229                line = current_line if use_current else next_line
230                current_line = CharLine(
231                    self,
232                    current_line.chars + next_line.chars,
233                    line.bottom,
234                    line.origin,
235                    line.top,
236                    height,
237                    line.rotation,
238                    sort_origin=line._sort_origin,
239                )
240            else:
241                # The next line does not overlap the current line
242                merged_lines.append(current_line)
243                current_line = next_line
244        # append last line
245        merged_lines.append(current_line)
246
247        # Sort all lines horizontally based on character origin
248        sorted_lines = []
249        for line in merged_lines:
250            if line.rotation == 90:
251
252                def sort_key(char):
253                    if char.unicode in {0xA, 0xD}:
254                        return char.tbbox.midpoint.y - 1e9
255                    return char.tbbox.midpoint.y
256            elif line.rotation == 270:
257
258                def sort_key(char):
259                    if char.unicode in {0xA, 0xD}:
260                        return -char.tbbox.midpoint.y + 1e9
261                    return -char.tbbox.midpoint.y
262            else:
263
264                def sort_key(char):
265                    if char.unicode in {0xA, 0xD}:
266                        return char.origin.x + 1e9
267                    return char.origin.x
268
269            sorted_lines.append(
270                CharLine(
271                    self,
272                    sorted(line.chars, key=sort_key),
273                    line.bottom,
274                    line.origin,
275                    line.top,
276                    line.height,
277                    line.rotation,
278                    area.left,
279                    sort_origin=line._sort_origin,
280                )
281            )
282
283        return sorted_lines
284
285    def graphic_bboxes_in_area(
286        self, area: Rectangle, with_graphics: bool = True
287    ) -> list[tuple[Rectangle, Table | Figure | None]]:
288        """
289        Coalesce the graphics in the area into full width bounding boxes.
290
291        1. Group vertically overlapping graphics.
292        2. Widen the overlapped graphics bounding boxes to the edges of the area.
293
294        :param area: area to search for content.
295        :param with_graphics: search for graphics in the area.
296        :return: list of tuples (bounding box, graphic objects or `None`).
297        """
298        if with_graphics:
299            graphics = self.graphics_in_area(area)
300            regions = []
301            # Check if graphics bounding boxes overlap vertically and group them
302            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
303                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
304                for reg in regions:
305                    if reg.overlaps(gbbox.bottom, gbbox.top):
306                        # They overlap, so merge them
307                        reg.v0 = min(reg.v0, gbbox.bottom)
308                        reg.v1 = max(reg.v1, gbbox.top)
309                        reg.objs.append(graphic)
310                        break
311                else:
312                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))
313
314            # print(regions)
315            # Coalesce all overlapped graphics objects into full width areas
316            areas = []
317            ypos = area.top
318            for reg in regions:
319                if ypos - reg.v1 > self._spacing["y_em"]:
320                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
321                for obj in reg.objs:
322                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
323                    areas.append((oarea, obj))
324                ypos = reg.v0
325            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
326        else:
327            areas = [(area, None)]
328        return areas
329
330    def objects_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[CharLine | Table | Figure]:
331        """
332        Find all content objects in this area.
333
334        :param area: area to search for content.
335        :param with_graphics: search for graphics in the area.
336        :return: list of content objects sorted top to bottom.
337        """
338        self._link_characters()
339        areas = self.graphic_bboxes_in_area(area, with_graphics)
340        objects = []
341        for narea, obj in areas:
342            if obj is None:
343                objects += self.charlines_in_area(narea)
344            else:
345                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
346
347                def predicate(c):
348                    return not obj.bbox.contains(c.origin)
349
350                lines = self.charlines_in_area(oarea, predicate)
351                # print(obj, oarea, lines, [line.content for line in lines])
352                objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
353        return objects
354
    def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
        """
        Find all tables and figures in this area.

        This implementation does not search for anything and always returns an
        empty list. NOTE(review): presumably subclasses override it with the
        actual table and figure detection — confirm.

        :param area: area to search for graphics.
        :return: list of tables and figures.
        """
        return []
363
    def ast_in_area(self, area: Rectangle, with_graphics: bool = True) -> Node:
        """
        Convert the area content into an abstract syntax tree.

        This implementation only creates a single `area` node that stores the
        area, its left edge truncated to an integer, and this page. The
        `with_graphics` parameter is not used here.

        :param area: area to search for content.
        :param with_graphics: including graphics in the area.
        :return: An abstract syntax tree including the content formatting.
        """
        return Node("area", obj=area, xpos=int(area.left), page=self)
373
374    @property
375    def content_ast(self) -> list[Node]:
376        """The abstract syntax trees in the content area."""
377        ast = []
378        with_graphics = True
379        for area in self._areas["content"]:
380            ast.append(self.ast_in_area(area, with_graphics=with_graphics))
381        # Add a page node to the first leaf to keep track of where a page starts
382        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
383        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
384        return ast
385
386    @property
387    def content_objects(self) -> list[CharLine | Table | Figure]:
388        """All objects in the content areas."""
389        objs = []
390        for area in self._areas["content"]:
391            objs.extend(self.objects_in_area(area))
392        return objs
393
394    @property
395    def content_graphics(self) -> list[Table | Figure]:
396        """All graphics in the content areas."""
397        objs = []
398        for area in self._areas["content"]:
399            objs.extend(self.graphics_in_area(area))
400        return objs
401
402    @property
403    def content_lines(self) -> list[CharLine]:
404        """All lines in the content areas."""
405        objs = []
406        for area in self._areas["content"]:
407            objs.extend(self.charlines_in_area(area))
408        return objs
409
410    @property
411    def content_tables(self) -> list[Table]:
412        """All tables in the content areas."""
413        return [o for o in self.content_graphics if isinstance(o, Table)]
414
415    @property
416    def content_figures(self) -> list[Figure]:
417        """All figures in the content areas."""
418        return [o for o in self.content_graphics if isinstance(o, Figure)]
419
420    def __repr__(self) -> str:
421        return f"Page({self.number})"

This class provides low-level access to the graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.

Page(document, index: int)
    def __init__(self, document, index: int):
        """
        :param document: a PDF document.
        :param index: 0-index page number.
        """
        super().__init__(document, index)
        # NOTE(review): not read anywhere in this class; presumably consumed by
        # subclasses or callers to select a layout template — confirm.
        self._template = "default"
        self.is_relevant: bool = True
        """Is this page relevant for the conversion?"""
Parameters
  • document: a PDF document.
  • index: 0-index page number.
is_relevant: bool

Is this page relevant for the conversion?

def text_in_named_area(self, name: str, check_length: bool = True) -> str | None:
109    def text_in_named_area(self, name: str, check_length: bool = True) -> str | None:
110        """
111        Find all text in the named area.
112
113        :param name: the name of the area(s) to query.
114        :param check_length: assert that the text has a length.
115        :return: the concatenated text of the named area(s) or `None` if area not found.
116        """
117        if name not in self._areas:
118            return None
119        text = ""
120        areas = self._areas[name]
121        if not isinstance(areas, list):
122            areas = [areas]
123        for area in areas:
124            text += self.text_in_area(area)
125        if check_length:
126            assert text
127        return text

Find all text in the named area.

Parameters
  • name: the name of the area(s) to query.
  • check_length: assert that the text has a length.
Returns

the concatenated text of the named area(s) or None if area not found.

def charlines_in_area( self, area: modm_data.utils.Rectangle, predicate: Callable[[modm_data.pdf.Character], bool] = None, rtol: float = None) -> list[modm_data.pdf2html.line.CharLine]:
129    def charlines_in_area(
130        self, area: Rectangle, predicate: Callable[[Character], bool] = None, rtol: float = None
131    ) -> list[CharLine]:
132        """
133        Coalesce the characters in the area and predicate into lines.
134
135        1. Every character in the area is filtered by the `predicate`.
136        2. Character orientation is split into horizontal (left->right) and
137           vertical (bottom->top) character lines sorted by x or y position.
138           Lines containing only whitespace are discarded.
139        3. Overlapping character lines are merged into sub- and superscript
140           using `rtol * max(current_line.height, next_line.height)` as the
141           tolerance for checking if the lines overlap.
142        4. The characters in the merged lines are re-sorted by origin.
143
144        :param area: Area to search for characters.
145        :param predicate: Function to discard characters in the area or include all by default.
146        :param rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default.
147        :return: A list of character lines sorted by x or y position.
148        """
149        if rtol is None:
150            rtol = self._spacing["sc"]
151        # Split all chars into lines based on rounded origin
152        origin_lines_y = defaultdict(list)
153        origin_lines_x = defaultdict(list)
154        for char in self.chars_in_area(area):
155            # Ignore all characters we don't want
156            if predicate is not None and not predicate(char):
157                continue
158            cunicode = self._unicode_filter(char.unicode)
159            if cunicode is None:
160                continue
161            char.unicode = cunicode
162            if char.unicode < 32 and char.unicode not in {0xA}:
163                continue
164            # Ignore characters without width that are not spaces
165            if not char.width and char.unicode not in {0xA, 0xD, 0x20}:
166                _LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
167            # Split up the chars depending on the orientation
168            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
169                origin_lines_x[round(char.origin.x, 1)].append(char)
170            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
171                origin_lines_y[round(char.origin.y, 1)].append(char)
172            else:
173                _LOGGER.error("Unknown char rotation:", char, char.rotation)
174
175        # Convert characters into lines
176        bbox_lines_y = []
177        for chars in origin_lines_y.values():
178            # Remove lines with whitespace only
179            if all(c.unicode in {0xA, 0xD, 0x20} for c in chars):
180                continue
181            origin = statistics.fmean(c.origin.y for c in chars)
182            line = CharLine(
183                self,
184                chars,
185                min(c.bbox.bottom for c in chars),
186                origin,
187                max(c.bbox.top for c in chars),
188                max(c.height for c in chars),
189                sort_origin=self.height - origin,
190            )
191            bbox_lines_y.append(line)
192            # print(line, line.top, line.origin, line.bottom, line.height)
193        bbox_lines = sorted(bbox_lines_y, key=lambda line: line._sort_origin)
194
195        bbox_lines_x = []
196        for chars in origin_lines_x.values():
197            # Remove lines with whitespace only
198            if all(c.unicode in {0xA, 0xD, 0x20} for c in chars):
199                continue
200            line = CharLine(
201                self,
202                chars,
203                min(c.bbox.left for c in chars),
204                statistics.fmean(c.origin.x for c in chars),
205                max(c.bbox.right for c in chars),
206                max(c.width for c in chars),
207                270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90,
208            )
209            bbox_lines_x.append(line)
210        bbox_lines += sorted(bbox_lines_x, key=lambda line: line._sort_origin)
211
212        if not bbox_lines:
213            return []
214
215        # Merge lines that have overlapping bbox_lines
216        # FIXME: This merges lines that "collide" vertically like in formulas
217        merged_lines = []
218        current_line = bbox_lines[0]
219        for next_line in bbox_lines[1:]:
220            height = max(current_line.height, next_line.height)
221            # Calculate overlap via normalize origin (increasing with line index)
222            if (current_line._sort_origin + rtol * height) > (next_line._sort_origin - rtol * height):
223                # if line.rotation or self.rotation:
224                #     # The next line overlaps this one, we merge the shorter line
225                #     # (typically super- and subscript) into taller line
226                #     use_current = len(current_line.chars) >= len(next_line.chars)
227                # else:
228                use_current = current_line.height >= next_line.height
229                line = current_line if use_current else next_line
230                current_line = CharLine(
231                    self,
232                    current_line.chars + next_line.chars,
233                    line.bottom,
234                    line.origin,
235                    line.top,
236                    height,
237                    line.rotation,
238                    sort_origin=line._sort_origin,
239                )
240            else:
241                # The next line does not overlap the current line
242                merged_lines.append(current_line)
243                current_line = next_line
244        # append last line
245        merged_lines.append(current_line)
246
247        # Sort all lines horizontally based on character origin
248        sorted_lines = []
249        for line in merged_lines:
250            if line.rotation == 90:
251
252                def sort_key(char):
253                    if char.unicode in {0xA, 0xD}:
254                        return char.tbbox.midpoint.y - 1e9
255                    return char.tbbox.midpoint.y
256            elif line.rotation == 270:
257
258                def sort_key(char):
259                    if char.unicode in {0xA, 0xD}:
260                        return -char.tbbox.midpoint.y + 1e9
261                    return -char.tbbox.midpoint.y
262            else:
263
264                def sort_key(char):
265                    if char.unicode in {0xA, 0xD}:
266                        return char.origin.x + 1e9
267                    return char.origin.x
268
269            sorted_lines.append(
270                CharLine(
271                    self,
272                    sorted(line.chars, key=sort_key),
273                    line.bottom,
274                    line.origin,
275                    line.top,
276                    line.height,
277                    line.rotation,
278                    area.left,
279                    sort_origin=line._sort_origin,
280                )
281            )
282
283        return sorted_lines

Coalesce the characters in the area and predicate into lines.

  1. Every character in the area is filtered by the predicate.
  2. Character orientation is split into horizontal (left->right) and vertical (bottom->top) character lines sorted by x or y position. Lines containing only whitespace are discarded.
  3. Overlapping character lines are merged into sub- and superscript using rtol * max(current_line.height, next_line.height) as the tolerance for checking if the lines overlap.
  4. The characters in the merged lines are re-sorted by origin.
Parameters
  • area: Area to search for characters.
  • predicate: Function to discard characters in the area or include all by default.
  • rtol: Relative tolerance to separate lines vertically or use sc spacing by default.
Returns

A list of character lines sorted by x or y position.

def graphic_bboxes_in_area( self, area: modm_data.utils.Rectangle, with_graphics: bool = True) -> list[tuple[modm_data.utils.Rectangle, modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure | None]]:
285    def graphic_bboxes_in_area(
286        self, area: Rectangle, with_graphics: bool = True
287    ) -> list[tuple[Rectangle, Table | Figure | None]]:
288        """
289        Coalesce the graphics in the area into full width bounding boxes.
290
291        1. Group vertically overlapping graphics.
292        2. Widen the overlapped graphics bounding boxes to the edges of the area.
293
294        :param area: area to search for content.
295        :param with_graphics: search for graphics in the area.
296        :return: list of tuples (bounding box, graphic objects or `None`).
297        """
298        if with_graphics:
299            graphics = self.graphics_in_area(area)
300            regions = []
301            # Check if graphics bounding boxes overlap vertically and group them
302            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
303                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
304                for reg in regions:
305                    if reg.overlaps(gbbox.bottom, gbbox.top):
306                        # They overlap, so merge them
307                        reg.v0 = min(reg.v0, gbbox.bottom)
308                        reg.v1 = max(reg.v1, gbbox.top)
309                        reg.objs.append(graphic)
310                        break
311                else:
312                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))
313
314            # print(regions)
315            # Coalesce all overlapped graphics objects into full width areas
316            areas = []
317            ypos = area.top
318            for reg in regions:
319                if ypos - reg.v1 > self._spacing["y_em"]:
320                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
321                for obj in reg.objs:
322                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
323                    areas.append((oarea, obj))
324                ypos = reg.v0
325            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
326        else:
327            areas = [(area, None)]
328        return areas

Coalesce the graphics in the area into full width bounding boxes.

  1. Group vertically overlapping graphics.
  2. Widen the overlapped graphics bounding boxes to the edges of the area.
Parameters
  • area: area to search for content.
  • with_graphics: search for graphics in the area.
Returns

list of tuples (bounding box, graphic objects or None).

def objects_in_area( self, area: modm_data.utils.Rectangle, with_graphics: bool = True) -> list[modm_data.pdf2html.line.CharLine | modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure]:
330    def objects_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[CharLine | Table | Figure]:
331        """
332        Find all content objects in this area.
333
334        :param area: area to search for content.
335        :param with_graphics: search for graphics in the area.
336        :return: list of content objects sorted top to bottom.
337        """
338        self._link_characters()
339        areas = self.graphic_bboxes_in_area(area, with_graphics)
340        objects = []
341        for narea, obj in areas:
342            if obj is None:
343                objects += self.charlines_in_area(narea)
344            else:
345                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
346
347                def predicate(c):
348                    return not obj.bbox.contains(c.origin)
349
350                lines = self.charlines_in_area(oarea, predicate)
351                # print(obj, oarea, lines, [line.content for line in lines])
352                objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
353        return objects

Find all content objects in this area.

Parameters
  • area: area to search for content.
  • with_graphics: search for graphics in the area.
Returns

list of content objects sorted top to bottom.

def graphics_in_area( self, area: modm_data.utils.Rectangle) -> list[modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure]:
355    def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
356        """
357        Find all tables and figures in this area.
358
359        :param area: area to search for graphics.
360        :return: list of tables and figures.
361        """
362        return []

Find all tables and figures in this area.

Parameters
  • area: area to search for graphics.
Returns

list of tables and figures.

def ast_in_area( self, area: modm_data.utils.Rectangle, with_graphics: bool = True) -> anytree.node.node.Node:
364    def ast_in_area(self, area: Rectangle, with_graphics: bool = True) -> Node:
365        """
366        Convert the area content into an abstract syntax tree.
367
368        :param area: area to search for content.
369        :param with_graphics: including graphics in the area.
370        :return: An abstract syntax tree including the content formatting.
371        """
372        return Node("area", obj=area, xpos=int(area.left), page=self)

Convert the area content into an abstract syntax tree.

Parameters
  • area: area to search for content.
  • with_graphics: including graphics in the area.
Returns

An abstract syntax tree including the content formatting.

content_ast: list[anytree.node.node.Node]
374    @property
375    def content_ast(self) -> list[Node]:
376        """The abstract syntax trees in the content area."""
377        ast = []
378        with_graphics = True
379        for area in self._areas["content"]:
380            ast.append(self.ast_in_area(area, with_graphics=with_graphics))
381        # Add a page node to the first leaf to keep track of where a page starts
382        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
383        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
384        return ast

The abstract syntax trees in the content area.

386    @property
387    def content_objects(self) -> list[CharLine | Table | Figure]:
388        """All objects in the content areas."""
389        objs = []
390        for area in self._areas["content"]:
391            objs.extend(self.objects_in_area(area))
392        return objs

All objects in the content areas.

394    @property
395    def content_graphics(self) -> list[Table | Figure]:
396        """All graphics in the content areas."""
397        objs = []
398        for area in self._areas["content"]:
399            objs.extend(self.graphics_in_area(area))
400        return objs

All graphics in the content areas.

content_lines: list[modm_data.pdf2html.line.CharLine]
402    @property
403    def content_lines(self) -> list[CharLine]:
404        """All lines in the content areas."""
405        objs = []
406        for area in self._areas["content"]:
407            objs.extend(self.charlines_in_area(area))
408        return objs

All lines in the content areas.

content_tables: list[modm_data.pdf2html.table.Table]
410    @property
411    def content_tables(self) -> list[Table]:
412        """All tables in the content areas."""
413        return [o for o in self.content_graphics if isinstance(o, Table)]

All tables in the content areas.

content_figures: list[modm_data.pdf2html.figure.Figure]
415    @property
416    def content_figures(self) -> list[Figure]:
417        """All figures in the content areas."""
418        return [o for o in self.content_graphics if isinstance(o, Figure)]

All figures in the content areas.

Inherited Members
modm_data.pdf.page.Page
index
number
label
width
height
rotation
bbox
char_count
char
chars
chars_in_area
text_in_area
structures
find
paths
images
graphic_clusters
pypdfium2._helpers.page.PdfPage
parent
get_width
get_height
get_size
get_rotation
set_rotation
get_mediabox
set_mediabox
get_cropbox
set_cropbox
get_bleedbox
set_bleedbox
get_trimbox
set_trimbox
get_artbox
set_artbox
get_bbox
get_textpage
insert_obj
remove_obj
gen_content
get_objects
render
pypdfium2.internal.bases.AutoCloseable
close