modm_data.pdf2html.stmicro.page

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4import re
  5import math
  6import logging
  7import textwrap
  8import statistics
  9from functools import cached_property, cache, reduce
 10from collections import defaultdict
 11from .table import Table
 12from ..figure import Figure
 13from ..line import CharLine
 14from ...utils import HLine, VLine, Rectangle, Region
 15from ...pdf import Path, Image, Page as PdfPage
 16from anytree import Node
 17
 18
 19LOGGER = logging.getLogger(__name__)
 20
 21def is_compatible(document) -> bool:
 22    if "stmicro" in document.metadata.get("Author", "").lower():
 23        return True
 24    return False
 25
 26
 27def areas_black_white(page) -> dict:
 28    def _scale(r):
 29        if page.rotation:
 30            return Rectangle(r.bottom * page.width, (1 - r.right) * page.height,
 31                             r.top * page.width, (1 - r.left) * page.height)
 32        return Rectangle(r.left * page.width, r.bottom * page.height,
 33                         r.right * page.width, r.top * page.height)
 34
 35    bottom_left = Rectangle(0.1, 0.1, 0.3, 0.12)
 36    bottom_middle = Rectangle(0.3, 0.1, 0.7, 0.12)
 37    bottom_right = Rectangle(0.7, 0.1, 0.9, 0.12)
 38    top = Rectangle(0.1, 0.9125, 0.9, 0.9375)
 39    content = Rectangle(0.025, 0.12, 0.975, 0.905 if page.index else 0.79)
 40    all_content = [content]
 41    areas = {
 42        # Bottom string in the middle: Example "RM0410 Rev 4"
 43        "id": bottom_middle,
 44    }
 45    if page.index == 0:
 46        # Publish date on the bottom left on first page
 47        areas["date"] = bottom_left
 48        # number on the bottom right on first page
 49        areas["number"] = bottom_right
 50        # Add top areas
 51        all_content.insert(0, Rectangle(0.375, 0.855, 0.975, 0.9125))
 52        all_content.insert(1, Rectangle(0.025, 0.805, 0.975, 0.855))
 53    else:
 54        # Page number on bottom
 55        areas["number"] = bottom_left if page.index % 2 else bottom_right
 56        # Chapter name on top
 57        areas["top"] = top
 58
 59    # Recognize the two column design of the Datasheets with a big table underneath
 60    if page.index < 3 and "DS" in page.pdf.name:
 61        # Find a wide path that would denote the beginning of a table
 62        top_rect = [p.bbox.top / page.height for p in page.paths
 63                    if _scale(content).contains(p.bbox) and p.bbox.width > page.width * 0.75]
 64        if top_rect:
 65            # offset for table label just above it
 66            ybottom = max(*top_rect) + 0.0175
 67        else:
 68            ybottom = content.bottom
 69        # Try to find list or sublists in these areas
 70        mr = Rectangle(0.49, ybottom, 0.51, content.top)
 71        br = Rectangle(0.51, ybottom, 0.5325, content.top)
 72        hr = Rectangle(0.5325, ybottom, 0.555, content.top)
 73        text_middle = page.text_in_area(_scale(mr))
 74        text_bullets = page.text_in_area(_scale(br))
 75        text_hyphens = page.text_in_area(_scale(hr))
 76        if (not text_middle and
 77            (any(c in text_bullets for c in {"•", chr(61623)}) or
 78             any(c in text_hyphens for c in {"-"}))):
 79            areas["middle_bullets"] = br
 80            areas["middle_hyphens"] = hr
 81            all_content = all_content[:-1]
 82            all_content.append(Rectangle(content.left, ybottom, 0.5, content.top))
 83            all_content.append(Rectangle(0.505, ybottom, content.right, content.top))
 84            if top_rect:
 85                all_content.append(Rectangle(content.left, content.bottom, content.right, ybottom))
 86
 87    areas["content"] = all_content
 88    scaled_areas = {}
 89    for name, area in areas.items():
 90        if isinstance(area, list):
 91            scaled_areas[name] = [_scale(r) for r in area]
 92        else:
 93            scaled_areas[name] = _scale(area)
 94    return scaled_areas
 95
 96
 97def areas_blue_gray(page) -> dict:
 98    def _scale(r):
 99        return Rectangle(r.left * page.width, r.bottom * page.height,
100                         r.right * page.width, r.top * page.height)
101
102    # This template doesn't use rotated pages, instead uses
103    # hardcoded rotated page dimensions
104    if page.width > page.height:
105        content = Rectangle(0.05, 0.025, 0.89, 0.975)
106        bottom_left = Rectangle(0, 0.6, 0.05, 1)
107        top_right = Rectangle(0.9025, 0.05, 0.9175, 0.7)
108    else:
109        content = Rectangle(0.025, 0.05, 0.975, 0.89 if page.index else 0.81)
110        bottom_left = Rectangle(0, 0, 0.4, 0.05)
111        top_right = Rectangle(0.3, 0.9025, 0.95, 0.9175)
112    areas = {
113        "id": bottom_left,
114        "top": top_right,
115        "all_content": content,
116        "content": []
117    }
118    if page.index == 0:
119        areas["content"] = [
120            # Document device string
121            Rectangle(0.4, 0.91, 0.95, 0.95),
122            # Document description string
123            Rectangle(0.05, 0.81, 0.95, 0.86)
124        ]
125    if page.index < 10:
126        # Contains only a table with product summary
127        br = Rectangle(0.35, content.bottom, 0.37, content.top)
128        text_bullets = page.text_in_area(_scale(br))
129        if any(c in text_bullets for c in {"•", chr(61623)}):
130            areas["middle_bullets"] = br
131            # Contains the actual content here
132            left = Rectangle(content.left, content.bottom, 0.3565, content.top)
133            right = Rectangle(0.3565, content.bottom, content.right, content.top)
134            areas["content"].extend([left, right])
135        else:
136            areas["content"] = [content]
137    else:
138        areas["content"] = [content]
139
140    scaled_areas = {}
141    for name, area in areas.items():
142        if isinstance(area, list):
143            scaled_areas[name] = [_scale(r) for r in area]
144        else:
145            scaled_areas[name] = _scale(area)
146    return scaled_areas
147
148
def spacing_black_white(page) -> dict:
    """Return the spacing thresholds of the black/white template.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dictionary with the keys ``x_em``, ``x_left``, ``x_right``,
             ``x_content``, ``y_em``, ``y_tline``, ``lh``, ``sc`` and ``th``.
    """
    rotated = bool(page.rotation)
    # On rotated pages the em units are derived from the swapped dimension
    x_unit, y_unit = (page.height, page.width) if rotated else (page.width, page.height)
    margin = 0.14 if rotated else 0.1125
    return {
        # Horizontal spacing: left->right
        "x_em": 0.01 * x_unit,
        "x_left": margin * page.width,
        "x_right": (1 - margin) * page.width,
        "x_content": 0.2075 * page.width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * y_unit,
        # Max table line thickness
        "y_tline": 0.005 * y_unit,
        # Max line height distance to detect paragraphs
        "lh": 1.2 if rotated else 0.9,
        # Max line height distance to detect super-/subscript
        "sc": 0.4 if rotated else 0.325,
        # Table header cell bold text threshold
        "th": 0.33,
    }
181
182
def spacing_blue_gray(page) -> dict:
    """Return the spacing thresholds of the blue/gray template.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dictionary with the keys ``x_em``, ``x_left``, ``x_right``,
             ``x_content``, ``y_em``, ``y_tline``, ``lh``, ``sc`` and ``th``.
    """
    if page.rotation:
        # Rotated pages derive the em units from the swapped dimension
        x_unit, y_unit = page.height, page.width
        left, right, indent = 0.05, 1 - 0.16, 0.2075
        paragraph_gap, script_gap = 1.6, 0.2
    else:
        x_unit, y_unit = page.width, page.height
        margin = 0.07
        left, right, indent = margin, 1 - margin, 0.165
        paragraph_gap, script_gap = 0.9, 0.3

    return {
        # Horizontal spacing: left->right
        "x_em": 0.01 * x_unit,
        "x_left": left * page.width,
        "x_right": right * page.width,
        "x_content": indent * page.width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * y_unit,
        # Max table line thickness
        "y_tline": 0.005 * y_unit,
        # Max line height distance to detect paragraphs
        "lh": paragraph_gap,
        # Max line height distance to detect super-/subscript
        "sc": script_gap,
        # Table header cell bold text threshold
        "th": 0.33,
    }
214
215
def linesize_black_white(line: "CharLine") -> str:
    """Classify a line of the black/white template by its height.

    :param line: object with a numeric ``height`` attribute (the parameter
                 was previously annotated as ``float`` although the
                 attribute is read from it).
    :return: ``"h1"`` to ``"h4"`` for headings, ``"n"`` for normal text and
             ``"fn"`` for anything smaller.
    """
    height = line.height
    # Lower bounds in descending order, the first match wins
    thresholds = ((17.5, "h1"), (15.5, "h2"), (13.5, "h3"), (11.4, "h4"), (8.5, "n"))
    for lower_bound, name in thresholds:
        if height >= lower_bound:
            return name
    return "fn"
224
225
def linesize_blue_gray(line: "CharLine") -> str:
    """Classify a line of the blue/gray template by its rounded height.

    :param line: object with a numeric ``height`` attribute (the parameter
                 was previously annotated as ``float`` although the
                 attribute is read from it).
    :return: ``"h1"`` to ``"h4"`` for headings, ``"n"`` for normal text and
             ``"fn"`` for anything smaller.
    """
    # The height is rounded with the builtin round() before comparing
    height = round(line.height)
    # Lower bounds in descending order, the first match wins
    thresholds = ((16, "h1"), (14, "h2"), (12, "h3"), (10, "h4"), (7, "n"))
    for lower_bound, name in thresholds:
        if height >= lower_bound:
            return name
    return "fn"
234
235
def colors_black_white(color: int) -> str:
    """Map a color value of the black/white template to its name.

    :param color: the integer color value to look up.
    :return: ``"black"`` for ``0xff``, ``"white"`` for ``0xffffffff`` and
             ``"unknown"`` for every other value.
    """
    known_colors = {
        0x000000ff: "black",
        0xffffffff: "white",
    }
    return known_colors.get(color, "unknown")
242
243
def colors_blue_gray(color: int) -> str:
    """Map a color value of the blue/gray template to its name.

    :param color: the integer color value to look up.
    :return: one of ``"black"``, ``"white"``, ``"gray"``, ``"lightblue"``,
             ``"darkblue"`` or ``"blue"``; ``"unknown"`` for any other value.
    """
    known_colors = {
        0x000000ff: "black",
        0xffffffff: "white",
        0xb9c4caff: "gray",
        0x1f81afff: "lightblue",
        0x002052ff: "darkblue",
        0x39a9dcff: "blue",
    }
    return known_colors.get(color, "unknown")
258
259
260class Page(PdfPage):
261
262    def __init__(self, document, index: int):
263        super().__init__(document, index)
264        self._template = "black_white"
265        producer = self.pdf.metadata.get("Producer", "").lower()
266        if "acrobat" in producer:
267            pass # default
268        elif "antenna" in producer:
269            self._template = "blue_gray"
270        else:
271            LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")
272
273        if "blue_gray" in self._template:
274            self._areas = areas_blue_gray(self)
275            self._spacing = spacing_blue_gray(self)
276            self._colors = colors_blue_gray
277            self._line_size = linesize_blue_gray
278        elif "black_white" in self._template:
279            self._areas = areas_black_white(self)
280            self._spacing = spacing_black_white(self)
281            self._colors = colors_black_white
282            self._line_size = linesize_black_white
283
284        # Patches to detect the header cells correctly
285        if ((self.pdf.name == "DS12930-v1" and self.index in range(90, 106)) or
286            (self.pdf.name == "DS12931-v1" and self.index in range(89, 105))):
287            self._spacing["th"] = 0.1
288        if ((self.pdf.name == "RM0453-v2" and self.index in [1354]) or
289            (self.pdf.name == "RM0456-v2" and self.index in [2881]) or
290            (self.pdf.name == "RM0456-v3" and self.index in [2880]) or
291            (self.pdf.name == "RM0461-v4" and self.index in [1246])):
292            self._spacing["th"] = 0.5
293        if ((self.pdf.name == "RM0456-v2" and self.index in [3005])):
294            self._spacing["th"] = 0.52
295
296    def _text_in_area(self, name, check_length=True) -> str:
297        if name not in self._areas: return ""
298        text = ""
299        areas = self._areas[name]
300        if not isinstance(areas, list): areas = [areas]
301        for area in areas:
302            text += self.text_in_area(area)
303        if check_length: assert text
304        return text
305
306    @cached_property
307    def identifier(self) -> str:
308        return self._text_in_area("id", check_length=False)
309
310    @cached_property
311    def top(self) -> str:
312        if self.index == 0:
313            return "Cover"
314        return self._text_in_area("top", check_length=False)
315
316    def is_relevant(self) -> bool:
317        if any(c in self.top for c in {"Contents", "List of ", "Index"}):
318            return False
319        return True
320
321    def _charlines_filtered(self, area, predicate = None, rtol = None) -> list[CharLine]:
322        if rtol is None: rtol = self._spacing["sc"]
323        # Split all chars into lines based on rounded origin
324        origin_lines_y = defaultdict(list)
325        origin_lines_x = defaultdict(list)
326        for char in self.chars_in_area(area):
327            # Ignore all characters we don't want
328            if predicate is not None and not predicate(char):
329                continue
330            # Ignore Carriage Return characters and ® (superscript issues)
331            if char.unicode in {0xd, ord("®")}:
332                continue
333            # Correct some weird unicode stuffing choices
334            if char.unicode in {2}:
335                char.unicode = ord("-")
336            if char.unicode in {61623, 61664}:
337                char.unicode = ord("•")
338            if char.unicode < 32 and char.unicode not in {0xa}:
339                continue
340            # Ignore characters without width that are not spaces
341            if not char.width and char.unicode not in {0xa, 0xd, 0x20}:
342                LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
343            # Split up the chars depending on the orientation
344            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
345                origin_lines_x[round(char.origin.x, 1)].append(char)
346            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
347                origin_lines_y[round(char.origin.y, 1)].append(char)
348            else:
349                LOGGER.error("Unknown char rotation:", char, char.rotation)
350
351        # Convert characters into lines
352        bbox_lines_y = []
353        for chars in origin_lines_y.values():
354            # Remove lines with whitespace only
355            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
356                continue
357            origin = statistics.fmean(c.origin.y for c in chars)
358            line = CharLine(self, chars,
359                            min(c.bbox.bottom for c in chars),
360                            origin,
361                            max(c.bbox.top for c in chars),
362                            max(c.height for c in chars),
363                            sort_origin=self.height - origin)
364            bbox_lines_y.append(line)
365            # print(line, line.top, line.origin, line.bottom, line.height)
366        bbox_lines = sorted(bbox_lines_y, key=lambda l: l._sort_origin)
367
368        bbox_lines_x = []
369        for chars in origin_lines_x.values():
370            # Remove lines with whitespace only
371            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
372                continue
373            line = CharLine(self, chars,
374                            min(c.bbox.left for c in chars),
375                            statistics.fmean(c.origin.x for c in chars),
376                            max(c.bbox.right for c in chars),
377                            max(c.width for c in chars),
378                            270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90)
379            bbox_lines_x.append(line)
380        bbox_lines += sorted(bbox_lines_x, key=lambda l: l._sort_origin)
381
382        if not bbox_lines:
383            return []
384
385        # Merge lines that have overlapping bbox_lines
386        # FIXME: This merges lines that "collide" vertically like in formulas
387        merged_lines = []
388        current_line = bbox_lines[0]
389        for next_line in bbox_lines[1:]:
390            height = max(current_line.height, next_line.height)
391            # Calculate overlap via normalize origin (increasing with line index)
392            if ((current_line._sort_origin + rtol * height) >
393                (next_line._sort_origin - rtol * height)):
394                # if line.rotation or self.rotation:
395                #     # The next line overlaps this one, we merge the shorter line
396                #     # (typically super- and subscript) into taller line
397                #     use_current = len(current_line.chars) >= len(next_line.chars)
398                # else:
399                use_current = current_line.height >= next_line.height
400                line = current_line if use_current else next_line
401                current_line = CharLine(self, current_line.chars + next_line.chars,
402                                        line.bottom, line.origin, line.top,
403                                        height, line.rotation,
404                                        sort_origin=line._sort_origin)
405            else:
406                # The next line does not overlap the current line
407                merged_lines.append(current_line)
408                current_line = next_line
409        # append last line
410        merged_lines.append(current_line)
411
412        # Sort all lines horizontally based on character origin
413        sorted_lines = []
414        for line in merged_lines:
415            if line.rotation == 90:
416                def sort_key(char):
417                    if char.unicode in {0xa, 0xd}:
418                        return char.tbbox.midpoint.y - 1e9
419                    return char.tbbox.midpoint.y
420            elif line.rotation == 270:
421                def sort_key(char):
422                    if char.unicode in {0xa, 0xd}:
423                        return -char.tbbox.midpoint.y + 1e9
424                    return -char.tbbox.midpoint.y
425            else:
426                def sort_key(char):
427                    if char.unicode in {0xa, 0xd}:
428                        return char.origin.x + 1e9
429                    return char.origin.x
430            sorted_lines.append(CharLine(self, sorted(line.chars, key=sort_key),
431                                         line.bottom, line.origin,
432                                         line.top, line.height,
433                                         line.rotation, area.left,
434                                         sort_origin=line._sort_origin))
435
436        return sorted_lines
437
438    def _content_areas(self, area: Rectangle, with_graphics: bool = True) -> list:
439        if with_graphics:
440            graphics = self._graphics_filtered(area)
441            regions = []
442            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
443                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
444                for reg in regions:
445                    if reg.overlaps(gbbox.bottom, gbbox.top):
446                        # They overlap, so merge them
447                        reg.v0 = min(reg.v0, gbbox.bottom)
448                        reg.v1 = max(reg.v1, gbbox.top)
449                        reg.objs.append(graphic)
450                        break
451                else:
452                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))
453
454            # print(regions)
455            areas = []
456            ypos = area.top
457            for reg in regions:
458                if ypos - reg.v1 > self._spacing["y_em"]:
459                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
460                for obj in reg.objs:
461                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
462                    areas.append((oarea, obj))
463                ypos = reg.v0
464            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
465        else:
466            areas = [(area, None)]
467        return areas
468
469    def _objects_filtered(self, area: Rectangle, with_graphics: bool = True) -> list:
470        self._link_characters()
471        areas = self._content_areas(area, with_graphics)
472        objects = []
473        for narea, obj in areas:
474            if obj is None:
475                objects += self._charlines_filtered(narea)
476            else:
477                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
478                predicate = lambda c: not obj.bbox.contains(c.origin)
479                lines = self._charlines_filtered(oarea, predicate)
480                # print(obj, oarea, lines, [line.content for line in lines])
481                objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
482        return objects
483
484    @property
485    def content_ast(self) -> list:
486        ast = []
487        with_graphics = True
488        if "DS" in self.pdf.name:
489            # FIXME: Terrible hack to get the ordering information table fixed
490            # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
491            order_page = next((item.page_index for item in self.pdf.toc if item.level == 0 and
492                               re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)), -1)
493            with_graphics = (order_page != self.index)
494        for area in self._areas["content"]:
495            ast.append(self._ast_filtered(area, with_graphics=with_graphics))
496        # Add a page node to the first leaf to keep track of where a page starts
497        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
498        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
499        return ast
500
501    def _graphics_filtered(self, area) -> list:
502        # Find all graphic clusters in this area
503        em = self._spacing["y_em"]
504        large_area = area.offset_x(em/2)
505        graphic_clusters = self.graphic_clusters(lambda p: large_area.contains(p.bbox), em/2)
506        # for bbox, paths in raw_graphic_clusters:
507        #     # Some docs have large DRAFT chars in the background
508        #     if any(path.fill == 0xe6e6e6ff and path.stroke == 0xff for path in paths):
509        #         continue
510        #     graphic_clusters.append((bbox, paths))
511
512        # Find the captions and group them by y origin to catch side-by-side figures
513        ycaptions = defaultdict(list)
514        for line in self._charlines_filtered(area, lambda c: "Bold" in c.font):
515            for cluster in line.clusters():
516                for phrase in [r"Figure \d+\.", r"Table \d+\."]:
517                    if re.match(phrase, cluster.content):
518                        ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars))
519        ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)]
520
521        # Now associate these captions with the graphics bboxes
522        categories = []
523        for captions in ycaptions:
524            width = area.width / len(captions)
525            for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)):
526                left, right = area.left + ii * width, area.left + (ii + 1) * width
527                bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height
528
529                # Find the graphic associated with this caption
530                graphic = next(((b, p) for b, p in graphic_clusters
531                                if b.bottom <= bottom and
532                                   left <= b.left and b.right <= right), None)
533                if graphic is None:
534                    LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
535                    continue
536
537                if self._template == "blue_gray":
538                    # Search for all lines of the current caption with the same properties
539                    cbbox = Rectangle(left, bottom, right, top)
540                    cchars = self.chars_in_area(cbbox)
541                    while True:
542                        nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top)
543                        nchars = self.chars_in_area(nbbox)
544                        if len(cchars) >= len(nchars):
545                            break
546                        cbbox = nbbox
547                        cchars = nchars
548                elif self._template == "black_white":
549                    cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)
550
551                otype = phrase.split(" ")[0].lower()
552                if "Figure" in phrase:
553                    # Find all other graphics in the bounding box
554                    gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom)
555                    graphics = []
556                    for b, p in graphic_clusters:
557                        if gbbox.overlaps(b):
558                            graphics.append((b,p))
559                    for g in graphics:
560                        graphic_clusters.remove(g)
561                    gbbox = [cluster[0] for cluster in graphics]
562                    gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox)
563                    paths = [p for cluster in graphics for p in cluster[1]]
564
565                    if self._template == "blue_gray":
566                        # Search for characters below the graphics bbox, max 1 y_em
567                        gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom)
568                        while True:
569                            gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom)
570                            if not self.chars_in_area(gbbox):
571                                break
572                    # Generate the new bounding box which includes the caption
573                    gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom)
574                elif "Table" in phrase:
575                    graphic_clusters.remove(graphic)
576                    gbbox, paths = graphic
577                    if (self._template == "black_white" and
578                        sum(1 for path in paths if path.count == 2) >= len(paths) / 2):
579                        otype += "_lines"
580                categories.append((otype, cbbox, gbbox, paths))
581
582        # Deal with the remaining graphic categories
583        for gbbox, paths in graphic_clusters:
584            if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
585                continue
586            if any(isinstance(p, Image) for p in paths):
587                category = "figure"
588            elif self._template == "blue_gray":
589                if all(self._colors(path.stroke) == "gray" or
590                       self._colors(path.fill) == "darkblue" for path in paths):
591                    category = "table"
592                else:
593                    category = "figure"
594            elif self._template == "black_white":
595                # Some tables are rendered explicitly with filled rectangular
596                # shapes with others are implicitly rendered with stroked lines
597                stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2
598                is_table = stroked_table_lines or all(
599                    [any(p.isclose(pp) for pp in path.bbox.points)
600                     for p in path.points].count(True) >= len(path.points) * 2 / 3
601                    for path in paths)
602                if (len(paths) > 1 and is_table):
603                    category = "table"
604                    if stroked_table_lines:
605                        category += "_lines"
606                else:
607                    category = "figure"
608
609            if "table" in category:
610                # Check if there are only numbers on top of the table
611                cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"])
612                nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xa, 0xd}]
613
614                if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3:
615                    # This is a register table with invisible top borders!
616                    cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars))
617                    gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top)
618                    name = "register_" + category
619                else:
620                    cbbox = None
621                    name = category
622                categories.append((name, cbbox, gbbox, paths))
623            else:
624                categories.append(("figure", None, gbbox, paths))
625
626        # Convert the objects into specialized classes
627        categories.sort(key=lambda o: (-o[2].y, o[2].x))
628        objects = []
629        for otype, caption_bbox, graphics_bbox, graphics_paths in categories:
630            if "figure" in otype:
631                figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths)
632                objects.append(figure)
633            elif "table" in otype:
634                xlines, ylines, yhlines = [], [], []
635                for path in graphics_paths:
636                    if self._template == "blue_gray" or "_lines" in otype:
637                        if self._colors(path.stroke) == "gray" or "_lines" in otype:
638                            # Intercell paths in gray
639                            if len(path.lines) == 1:
640                                line = path.lines[0]
641                                if line.direction == line.Direction.VERTICAL:
642                                    xlines.append(line.specialize())
643                                elif line.direction == line.Direction.HORIZONTAL:
644                                    ylines.append(line.specialize())
645                                else:
646                                    LOGGER.warn(f"Line not vertical or horizontal: {line}")
647                            else:
648                                LOGGER.warn(f"Path too long: {path}")
649                        elif self._colors(path.fill) == "darkblue":
650                            # Add the bottom line of the dark blue header box as a very thick line
651                            line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
652                            yhlines.append(line)
653
654                    elif self._template == "black_white":
655                        bbox = path.bbox
656                        is_vertical = bbox.width < bbox.height
657                        width = bbox.width if is_vertical else bbox.height
658                        length = bbox.height if is_vertical else bbox.width
659                        if width <= self._spacing["x_em"] / 2:
660                            if length >= self._spacing["y_em"] / 2:
661                                if is_vertical:
662                                    line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width)
663                                    xlines.append(line)
664                                else:
665                                    line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height)
666                                    ylines.append(line)
667                        else:
668                            # Split the rectangle into it's outline
669                            xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1))
670                            xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1))
671                            ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1))
672                            ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1))
673                if yhlines:
674                    yhlines.sort(key=lambda l: l.p0.y)
675                    ylines.append(yhlines[0])
676                if not xlines or not ylines:
677                    continue
678                table = Table(self, graphics_bbox, xlines, ylines, caption_bbox,
679                              is_register="register" in otype)
680                objects.append(table)
681
682        return objects
683
684    @property
685    def content_objects(self) -> list:
686        objs = []
687        for area in self._areas["content"]:
688            objs.extend(self._objects_filtered(area))
689        return objs
690
691    @property
692    def content_graphics(self) -> list:
693        objs = []
694        for area in self._areas["content"]:
695            objs.extend(self._graphics_filtered(area))
696        return objs
697
698    @property
699    def content_lines(self) -> list:
700        return [o for o in self.content_objects if isinstance(o, CharLine)]
701
702    @property
703    def content_tables(self) -> list:
704        return [o for o in self.content_graphics if isinstance(o, Table)]
705
706    @property
707    def content_figures(self) -> list:
708        return [o for o in self.content_graphics if isinstance(o, Figure)]
709
710    def _char_properties(self, line, char):
711        cp = {
712            "superscript": False,
713            "subscript": False,
714            "bold": any(frag in char.font for frag in {"Bold"}),
715            "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
716            "underline": (char.objlink or char.weblink) is not None,
717            "size": round(line.height),
718            "relsize": self._line_size(line),
719            "char": chr(char.unicode),
720        }
721
722        if line.rotation:
723            if char.origin.x < (line.origin - 0.25 * line.height):
724                cp["superscript"] = True
725            elif char.origin.x > (line.origin + 0.15 * line.height):
726                cp["subscript"] = True
727        elif char.origin.y > (line.origin + 0.25 * line.height):
728            cp["superscript"] = True
729        elif char.origin.y < (line.origin - 0.15 * line.height):
730            cp["subscript"] = True
731
732        return cp
733
    def _ast_filtered(self, area: Rectangle, with_graphics=True,
                      ignore_xpos=False, with_bits=True, with_notes=True) -> list:
        """Build a tree of the objects found inside ``area``.

        The objects returned by ``self._objects_filtered(area, with_graphics)``
        are visited in order. Tables and figures become ``table``/``figure``
        nodes. Every text line is classified with regular expressions as a
        heading (``head<N>``), note (``note``), caption (``caption``), list
        item (``listb``/``lists``/``lista``/``listn``), register bit
        description (``bit``) or paragraph (``para``), and is then attached as
        a ``line`` node below the currently open node.

        :param area: the page area to process.
        :param with_graphics: forwarded to ``_objects_filtered``; also enables
            the detection of table and figure captions.
        :param ignore_xpos: if true, the horizontal position of a line never
            causes the current node to be unindented.
        :param with_bits: enables the detection of register bit descriptions.
        :param with_notes: enables the detection of note/caution/warning lines.
        :return: the root ``Node`` named ``"area"`` (note: the annotation says
            ``list``, but a single node is returned).
        """
        x_em = self._spacing["x_em"]
        spacing_content = self._spacing["x_content"]
        lh_factor = self._spacing["lh"]
        # spacing_y = self._spacing["y_em"]
        root = Node("area", obj=area, xpos=int(area.left), page=self)

        def unindent(_xpos, _current, _newlines=1):
            # Walk up the tree while the new x position lies more than one em
            # to the left of the node, then additionally leave an open
            # paragraph if at least two newlines were detected.
            current = _current
            # Check if we need to unindent the current node
            while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos:
                current = current.parent
            if _newlines >= 2 and current.name == "para":
                current = current.parent
            return current

        def parent_name(current):
            # Name of the parent node, or an empty string for the root
            return "" if current.parent is None else current.parent.name

        current = root
        # Vertical position of the previously handled object, used to
        # estimate the number of line breaks before the next text line
        ypos = area.top
        for obj in self._objects_filtered(area, with_graphics):
            xpos = round(obj.bbox.left)
            # Tables should remain in their current hierarchy regardless of indentation
            if isinstance(obj, (Table, Figure)):
                # Attach to the closest heading on the path to the root,
                # falling back to the root itself
                current = next((c for c in current.iter_path_reverse()
                                if c.name.startswith("head")), root)
                name = "figure" if isinstance(obj, Figure) else "table"
                Node(name, parent=current, obj=obj, xpos=xpos, number=-1,
                     _width=obj.bbox.width / area.width, _type=obj._type)
                ypos = obj.bbox.bottom
            # Lines of text need to be carefully checked for indentation
            elif isinstance(obj, CharLine):
                # Distance to the previous object in units of scaled line height
                newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
                content = obj.content
                lcontent = content.lstrip()
                # Index of the first character that belongs to the actual
                # content, i.e. after any detected marker
                content_start = 0
                linesize = self._line_size(obj)

                # Check when the note has finished (=> paragraphs without italic)
                if (parent_name(current) == "note" and
                    ((current.parent.type == "note" and not obj.contains_font(current.parent._font)) or
                     (current.parent.type in {"caution", "warning"} and newlines >= 2))):
                    current = current.parent.parent

                # Check when the list ends into something indented far too right
                elif (parent_name(current).startswith("list")
                      and (xpos - current.xpos) >= 2 * x_em):
                    current = current.parent.parent

                # print(obj.fonts, ypos, xpos, current.xpos, f"{obj.height:.2f}", content)
                # Check if line is a heading, which may be multi-line, so we must
                # be careful not to nest them, but group them properly
                # Headings are always inserted into the root node!
                if linesize.startswith("h1") or (linesize.startswith("h") and
                        xpos < (spacing_content + 2 * x_em) and "Bold" in obj.chars[0].font):
                    if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) *", content)) is not None:
                        # Numbered heading: the level follows from the number
                        # of dots in the marker
                        start = min(len(match.group(0)), len(obj.chars) - 1)
                        marker = match.group(1)
                        size = marker.count('.') + 2
                    else:
                        # Unnumbered heading: the level is taken from the
                        # second character of the line size string
                        start = 0
                        marker = None
                        size = linesize[1]
                    name = f"head{size}"
                    # Check if we're already parsing a heading, do not split into two
                    if parent_name(current) != name or newlines > 2:
                        content_start = start
                        xpos = round(obj.chars[content_start].bbox.left)
                        current = Node(name, parent=root, obj=obj, xpos=xpos,
                                       size=size, marker=marker)
                        current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if the line is a note and deal with the indentation correctly
                elif with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None:
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    # print(obj.fonts)
                    # Correct xposition only if the Note: string is very far left
                    if xpos + 4 * x_em <= current.xpos:
                        xpos = round(obj.chars[content_start].bbox.left)
                    # Prevent nesting of notes, they should only be listed
                    if parent_name(current) == "note":
                        current =  current.parent.parent
                    current = unindent(xpos, current, 2)
                    # The font of the first content character is stored so that
                    # the end of the note can be detected later
                    current = Node("note", parent=current, obj=obj, xpos=xpos,
                                   type=match.group(1).lower(), _font=obj.chars[content_start].font)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is Table or Figure caption
                elif with_graphics and ((match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? ?", content)) is not None
                      and "Bold" in obj.chars[0].font):
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    # Captions are attached to the closest heading, like tables
                    current = next((c for c in current.iter_path_reverse()
                                if c.name.startswith("head")), root)
                    current = Node("caption", parent=current, obj=obj, xpos=xpos,
                                   _type=match.group(1).lower(), number=int(match.group(2)))
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is list and group them according to indentation
                elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None:
                    current = unindent(xpos, current, newlines)
                    # The pattern ends with two arbitrary characters that
                    # already belong to the list content
                    content_start = len(match.group(0)) - 2
                    xpos = round(obj.chars[content_start].bbox.left)
                    # The list type is derived from the first non-space character
                    name = "listb"
                    value = lcontent[0]
                    if value in {"–", "-"}: name = "lists"
                    elif value.isalpha(): name = "lista"
                    elif value.isnumeric():
                        name = "listn"
                        value = int(match.group(2))
                    current = Node(name, parent=current, obj=obj, xpos=xpos, value=value)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is a register bit definition
                elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None:
                    if obj.contains_font("Bold"):
                        # Use the bold character as delimiter
                        content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font)
                    else:
                        # Default back to the regex
                        if "Reserved" not in content:
                            LOGGER.warning(f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}")
                        # content_start first holds the match object and is
                        # then replaced by the length of the matched prefix
                        content_start = re.match(r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content)
                        if content_start is None:
                            LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
                            content_start = 0
                        else:
                            content_start = len(content_start.group(0))
                        if not content_start:
                            LOGGER.error(f"Missing content start (=0)! '{content}'!")
                        content_start = min(content_start, len(obj.chars) - 1)

                    # Bit descriptions are attached to the closest heading
                    current = next((c for c in current.iter_path_reverse()
                                    if c.name.startswith("head")), root)
                    middle = obj.chars[content_start].bbox.left
                    xpos = round(middle)
                    current = Node("bit", parent=current, obj=obj, xpos=xpos, _page=self,
                                   _middle=middle, _left=area.left, _right=area.right)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if this is a new paragraph
                elif newlines >= 2 or current.name not in {"para"}:
                    # Fix issues where notes are reflowing back left of Note: text
                    if parent_name(current) in {"note"}:
                        if xpos < current.parent.xpos:
                            xpos = current.parent.xpos
                    # Prevent multiline
                    current = unindent(xpos, current, newlines)
                    current = Node("para", parent=current, obj=obj,
                                   xpos=xpos if current.is_root else current.xpos)

                # Continuation of the current paragraph: only unindent if the
                # paragraph is not directly below a caption, bit or area node
                elif (parent_name(current) not in {"caption", "bit", "area"}):
                    current = unindent(xpos, current, newlines)

                # Add the actual line
                Node("line", parent=current, obj=obj, xpos=xpos,
                     start=content_start, str=content[content_start:50])

                ypos = obj.origin

        return root
896
897    def __repr__(self) -> str:
898        return f"StPage({self.number})"
LOGGER = <Logger modm_data.pdf2html.stmicro.page (WARNING)>
def is_compatible(document) -> bool:
def is_compatible(document) -> bool:
    """Return whether the document's ``Author`` metadata mentions "stmicro".

    The comparison is case-insensitive; a missing ``Author`` entry is treated
    as an empty string.
    """
    author = document.metadata.get("Author", "")
    return "stmicro" in author.lower()
def areas_black_white(page) -> dict:
def areas_black_white(page) -> dict:
    """Compute the named page areas of the black/white template.

    All areas are first described as fractions of the page size and are
    converted with ``_scale`` before they are returned. The result always
    contains ``"id"``, ``"number"`` and ``"content"`` (a list of rectangles).
    ``"date"`` is only present on the first page, ``"top"`` only on all other
    pages, and ``"middle_bullets"``/``"middle_hyphens"`` only if a two column
    layout was detected.

    :param page: the page to analyse; its ``index``, ``rotation``, ``width``,
        ``height``, ``paths``, ``pdf.name`` and ``text_in_area()`` are used.
    :return: mapping of area name to a scaled rectangle or a list of scaled
        rectangles.
    """
    def _scale(r):
        # Convert a fractional rectangle into page units; for rotated pages
        # the coordinates are swapped and mirrored
        if page.rotation:
            return Rectangle(r.bottom * page.width, (1 - r.right) * page.height,
                             r.top * page.width, (1 - r.left) * page.height)
        return Rectangle(r.left * page.width, r.bottom * page.height,
                         r.right * page.width, r.top * page.height)

    bottom_left = Rectangle(0.1, 0.1, 0.3, 0.12)
    bottom_middle = Rectangle(0.3, 0.1, 0.7, 0.12)
    bottom_right = Rectangle(0.7, 0.1, 0.9, 0.12)
    top = Rectangle(0.1, 0.9125, 0.9, 0.9375)
    # The content area of the first page ends lower than on other pages
    content = Rectangle(0.025, 0.12, 0.975, 0.905 if page.index else 0.79)
    all_content = [content]
    areas = {
        # Bottom string in the middle: Example "RM0410 Rev 4"
        "id": bottom_middle,
    }
    if page.index == 0:
        # Publish date on the bottom left on first page
        areas["date"] = bottom_left
        # number on the bottom right on first page
        areas["number"] = bottom_right
        # Add top areas
        all_content.insert(0, Rectangle(0.375, 0.855, 0.975, 0.9125))
        all_content.insert(1, Rectangle(0.025, 0.805, 0.975, 0.855))
    else:
        # Page number on bottom
        areas["number"] = bottom_left if page.index % 2 else bottom_right
        # Chapter name on top
        areas["top"] = top

    # Recognize the two column design of the Datasheets with a big table underneath
    if page.index < 3 and "DS" in page.pdf.name:
        # The scaled content area does not depend on the path, compute it once
        scaled_content = _scale(content)
        # Find a wide path that would denote the beginning of a table
        top_rect = [p.bbox.top / page.height for p in page.paths
                    if scaled_content.contains(p.bbox) and p.bbox.width > page.width * 0.75]
        if top_rect:
            # offset for table label just above it
            # max() must receive the list itself: unpacking a list with a
            # single entry would call max() with one float and raise TypeError
            ybottom = max(top_rect) + 0.0175
        else:
            ybottom = content.bottom
        # Try to find list or sublists in these areas
        mr = Rectangle(0.49, ybottom, 0.51, content.top)
        br = Rectangle(0.51, ybottom, 0.5325, content.top)
        hr = Rectangle(0.5325, ybottom, 0.555, content.top)
        text_middle = page.text_in_area(_scale(mr))
        text_bullets = page.text_in_area(_scale(br))
        text_hyphens = page.text_in_area(_scale(hr))
        # Two columns are assumed if the middle strip is empty and the strips
        # right of it contain bullet or hyphen characters
        if (not text_middle and
            (any(c in text_bullets for c in {"•", chr(61623)}) or
             any(c in text_hyphens for c in {"-"}))):
            areas["middle_bullets"] = br
            areas["middle_hyphens"] = hr
            # Replace the single content area (last entry) with the left and
            # right column, plus the area below the wide path if one was found
            all_content = all_content[:-1]
            all_content.append(Rectangle(content.left, ybottom, 0.5, content.top))
            all_content.append(Rectangle(0.505, ybottom, content.right, content.top))
            if top_rect:
                all_content.append(Rectangle(content.left, content.bottom, content.right, ybottom))

    areas["content"] = all_content
    # Convert all fractional rectangles into page units
    scaled_areas = {}
    for name, area in areas.items():
        if isinstance(area, list):
            scaled_areas[name] = [_scale(r) for r in area]
        else:
            scaled_areas[name] = _scale(area)
    return scaled_areas
def areas_blue_gray(page) -> dict:
 98def areas_blue_gray(page) -> dict:
 99    def _scale(r):
100        return Rectangle(r.left * page.width, r.bottom * page.height,
101                         r.right * page.width, r.top * page.height)
102
103    # This template doesn't use rotated pages, instead uses
104    # hardcoded rotated page dimensions
105    if page.width > page.height:
106        content = Rectangle(0.05, 0.025, 0.89, 0.975)
107        bottom_left = Rectangle(0, 0.6, 0.05, 1)
108        top_right = Rectangle(0.9025, 0.05, 0.9175, 0.7)
109    else:
110        content = Rectangle(0.025, 0.05, 0.975, 0.89 if page.index else 0.81)
111        bottom_left = Rectangle(0, 0, 0.4, 0.05)
112        top_right = Rectangle(0.3, 0.9025, 0.95, 0.9175)
113    areas = {
114        "id": bottom_left,
115        "top": top_right,
116        "all_content": content,
117        "content": []
118    }
119    if page.index == 0:
120        areas["content"] = [
121            # Document device string
122            Rectangle(0.4, 0.91, 0.95, 0.95),
123            # Document description string
124            Rectangle(0.05, 0.81, 0.95, 0.86)
125        ]
126    if page.index < 10:
127        # Contains only a table with product summary
128        br = Rectangle(0.35, content.bottom, 0.37, content.top)
129        text_bullets = page.text_in_area(_scale(br))
130        if any(c in text_bullets for c in {"•", chr(61623)}):
131            areas["middle_bullets"] = br
132            # Contains the actual content here
133            left = Rectangle(content.left, content.bottom, 0.3565, content.top)
134            right = Rectangle(0.3565, content.bottom, content.right, content.top)
135            areas["content"].extend([left, right])
136        else:
137            areas["content"] = [content]
138    else:
139        areas["content"] = [content]
140
141    scaled_areas = {}
142    for name, area in areas.items():
143        if isinstance(area, list):
144            scaled_areas[name] = [_scale(r) for r in area]
145        else:
146            scaled_areas[name] = _scale(area)
147    return scaled_areas
def spacing_black_white(page) -> dict:
def spacing_black_white(page) -> dict:
    """Return the spacing constants of the black/white template.

    All lengths are derived from ``page.width`` and ``page.height``; a truthy
    ``page.rotation`` selects a different set of factors.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dict with the keys ``x_em``, ``x_left``, ``x_right``,
        ``x_content``, ``y_em``, ``y_tline``, ``lh``, ``sc`` and ``th``.
    """
    width = page.width
    height = page.height
    rotated = bool(page.rotation)
    # Fraction of the page width left free on either side of the content
    margin = 0.14 if rotated else 0.1125
    return {
        # Horizontal spacing: left->right
        "x_em": 0.01 * (height if rotated else width),
        "x_left": margin * width,
        "x_right": (1 - margin) * width,
        "x_content": 0.2075 * width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * (width if rotated else height),
        # Max table line thickness
        "y_tline": 0.005 * (width if rotated else height),
        # Max line height distance to detect paragraphs
        "lh": 1.2 if rotated else 0.9,
        # Max line height distance to detect super-/subscript
        "sc": 0.4 if rotated else 0.325,
        # Table header cell bold text threshold
        "th": 0.33,
    }
def spacing_blue_gray(page) -> dict:
def spacing_blue_gray(page) -> dict:
    """Return the spacing constants of the blue/gray template.

    All lengths are derived from ``page.width`` and ``page.height``; a truthy
    ``page.rotation`` selects a different set of factors.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dict with the keys ``x_em``, ``x_left``, ``x_right``,
        ``x_content``, ``y_em``, ``y_tline``, ``lh``, ``sc`` and ``th``.
    """
    width = page.width
    height = page.height
    if page.rotation:
        em_x, em_y = 0.01 * height, 0.01 * width
        left, right = 0.05 * width, (1 - 0.16) * width
        content_x = 0.2075 * width
        table_line = 0.005 * width
        line_height, script = 1.6, 0.2
    else:
        em_x, em_y = 0.01 * width, 0.01 * height
        left, right = 0.07 * width, (1 - 0.07) * width
        content_x = 0.165 * width
        table_line = 0.005 * height
        line_height, script = 0.9, 0.3
    return {
        # Horizontal spacing: left->right
        "x_em": em_x,
        "x_left": left,
        "x_right": right,
        "x_content": content_x,
        # Vertical spacing: bottom->top
        "y_em": em_y,
        # Max table line thickness
        "y_tline": table_line,
        # Max line height distance to detect paragraphs
        "lh": line_height,
        # Max line height distance to detect super-/subscript
        "sc": script,
        # Table header cell bold text threshold
        "th": 0.33,
    }
def linesize_black_white(line: float) -> str:
def linesize_black_white(line: float) -> str:
    """Classify a line of the black/white template by its height.

    Despite the annotation, ``line`` must be an object with a ``height``
    attribute. The unrounded height is compared against fixed thresholds.

    :return: ``"h1"`` to ``"h4"`` for headings, ``"n"`` for heights of at
        least 8.5 and ``"fn"`` for anything smaller.
    """
    height = line.height
    # Thresholds in descending order; the first one reached wins
    thresholds = (
        (17.5, "h1"),
        (15.5, "h2"),
        (13.5, "h3"),
        (11.4, "h4"),
        (8.5, "n"),
    )
    for minimum, label in thresholds:
        if height >= minimum:
            return label
    return "fn"
def linesize_blue_gray(line: float) -> str:
def linesize_blue_gray(line: float) -> str:
    """Classify a line of the blue/gray template by its rounded height.

    Despite the annotation, ``line`` must be an object with a ``height``
    attribute. The height is rounded before it is compared against fixed
    thresholds.

    :return: ``"h1"`` to ``"h4"`` for headings, ``"n"`` for rounded heights of
        at least 7 and ``"fn"`` for anything smaller.
    """
    height = round(line.height)
    # Thresholds in descending order; the first one reached wins
    thresholds = (
        (16, "h1"),
        (14, "h2"),
        (12, "h3"),
        (10, "h4"),
        (7, "n"),
    )
    for minimum, label in thresholds:
        if height >= minimum:
            return label
    return "fn"
def colors_black_white(color: int) -> str:
def colors_black_white(color: int) -> str:
    """Map a color value of the black/white template to a color name.

    :param color: integer color value.
    :return: ``"black"`` for ``0xff``, ``"white"`` for ``0xffffffff`` and
        ``"unknown"`` for every other value.
    """
    # Inclusive (lowest, highest, name) ranges, checked in order
    known_ranges = (
        (0xff, 0xff, "black"),
        (0xffffffff, 0xffffffff, "white"),
    )
    for lowest, highest, label in known_ranges:
        if lowest <= color <= highest:
            return label
    return "unknown"
def colors_blue_gray(color: int) -> str:
def colors_blue_gray(color: int) -> str:
    """Map a color value of the blue/gray template to a color name.

    :param color: integer color value.
    :return: one of ``"black"``, ``"white"``, ``"gray"``, ``"lightblue"``,
        ``"darkblue"`` or ``"blue"`` for the known values, otherwise
        ``"unknown"``.
    """
    # Inclusive (lowest, highest, name) ranges, checked in order
    known_ranges = (
        (0xff, 0xff, "black"),
        (0xffffffff, 0xffffffff, "white"),
        (0xb9c4caff, 0xb9c4caff, "gray"),
        (0x1f81afff, 0x1f81afff, "lightblue"),
        (0x2052ff, 0x2052ff, "darkblue"),
        (0x39a9dcff, 0x39a9dcff, "blue"),
    )
    for lowest, highest, label in known_ranges:
        if lowest <= color <= highest:
            return label
    return "unknown"
class Page(modm_data.pdf.page.Page):
261class Page(PdfPage):
262
263    def __init__(self, document, index: int):
264        super().__init__(document, index)
265        self._template = "black_white"
266        producer = self.pdf.metadata.get("Producer", "").lower()
267        if "acrobat" in producer:
268            pass # default
269        elif "antenna" in producer:
270            self._template = "blue_gray"
271        else:
272            LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")
273
274        if "blue_gray" in self._template:
275            self._areas = areas_blue_gray(self)
276            self._spacing = spacing_blue_gray(self)
277            self._colors = colors_blue_gray
278            self._line_size = linesize_blue_gray
279        elif "black_white" in self._template:
280            self._areas = areas_black_white(self)
281            self._spacing = spacing_black_white(self)
282            self._colors = colors_black_white
283            self._line_size = linesize_black_white
284
285        # Patches to detect the header cells correctly
286        if ((self.pdf.name == "DS12930-v1" and self.index in range(90, 106)) or
287            (self.pdf.name == "DS12931-v1" and self.index in range(89, 105))):
288            self._spacing["th"] = 0.1
289        if ((self.pdf.name == "RM0453-v2" and self.index in [1354]) or
290            (self.pdf.name == "RM0456-v2" and self.index in [2881]) or
291            (self.pdf.name == "RM0456-v3" and self.index in [2880]) or
292            (self.pdf.name == "RM0461-v4" and self.index in [1246])):
293            self._spacing["th"] = 0.5
294        if ((self.pdf.name == "RM0456-v2" and self.index in [3005])):
295            self._spacing["th"] = 0.52
296
297    def _text_in_area(self, name, check_length=True) -> str:
298        if name not in self._areas: return ""
299        text = ""
300        areas = self._areas[name]
301        if not isinstance(areas, list): areas = [areas]
302        for area in areas:
303            text += self.text_in_area(area)
304        if check_length: assert text
305        return text
306
307    @cached_property
308    def identifier(self) -> str:
309        return self._text_in_area("id", check_length=False)
310
311    @cached_property
312    def top(self) -> str:
313        if self.index == 0:
314            return "Cover"
315        return self._text_in_area("top", check_length=False)
316
317    def is_relevant(self) -> bool:
318        if any(c in self.top for c in {"Contents", "List of ", "Index"}):
319            return False
320        return True
321
322    def _charlines_filtered(self, area, predicate = None, rtol = None) -> list[CharLine]:
323        if rtol is None: rtol = self._spacing["sc"]
324        # Split all chars into lines based on rounded origin
325        origin_lines_y = defaultdict(list)
326        origin_lines_x = defaultdict(list)
327        for char in self.chars_in_area(area):
328            # Ignore all characters we don't want
329            if predicate is not None and not predicate(char):
330                continue
331            # Ignore Carriage Return characters and ® (superscript issues)
332            if char.unicode in {0xd, ord("®")}:
333                continue
334            # Correct some weird unicode stuffing choices
335            if char.unicode in {2}:
336                char.unicode = ord("-")
337            if char.unicode in {61623, 61664}:
338                char.unicode = ord("•")
339            if char.unicode < 32 and char.unicode not in {0xa}:
340                continue
341            # Ignore characters without width that are not spaces
342            if not char.width and char.unicode not in {0xa, 0xd, 0x20}:
343                LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
344            # Split up the chars depending on the orientation
345            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
346                origin_lines_x[round(char.origin.x, 1)].append(char)
347            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
348                origin_lines_y[round(char.origin.y, 1)].append(char)
349            else:
350                LOGGER.error("Unknown char rotation:", char, char.rotation)
351
352        # Convert characters into lines
353        bbox_lines_y = []
354        for chars in origin_lines_y.values():
355            # Remove lines with whitespace only
356            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
357                continue
358            origin = statistics.fmean(c.origin.y for c in chars)
359            line = CharLine(self, chars,
360                            min(c.bbox.bottom for c in chars),
361                            origin,
362                            max(c.bbox.top for c in chars),
363                            max(c.height for c in chars),
364                            sort_origin=self.height - origin)
365            bbox_lines_y.append(line)
366            # print(line, line.top, line.origin, line.bottom, line.height)
367        bbox_lines = sorted(bbox_lines_y, key=lambda l: l._sort_origin)
368
369        bbox_lines_x = []
370        for chars in origin_lines_x.values():
371            # Remove lines with whitespace only
372            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
373                continue
374            line = CharLine(self, chars,
375                            min(c.bbox.left for c in chars),
376                            statistics.fmean(c.origin.x for c in chars),
377                            max(c.bbox.right for c in chars),
378                            max(c.width for c in chars),
379                            270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90)
380            bbox_lines_x.append(line)
381        bbox_lines += sorted(bbox_lines_x, key=lambda l: l._sort_origin)
382
383        if not bbox_lines:
384            return []
385
386        # Merge lines that have overlapping bbox_lines
387        # FIXME: This merges lines that "collide" vertically like in formulas
388        merged_lines = []
389        current_line = bbox_lines[0]
390        for next_line in bbox_lines[1:]:
391            height = max(current_line.height, next_line.height)
392            # Calculate overlap via normalize origin (increasing with line index)
393            if ((current_line._sort_origin + rtol * height) >
394                (next_line._sort_origin - rtol * height)):
395                # if line.rotation or self.rotation:
396                #     # The next line overlaps this one, we merge the shorter line
397                #     # (typically super- and subscript) into taller line
398                #     use_current = len(current_line.chars) >= len(next_line.chars)
399                # else:
400                use_current = current_line.height >= next_line.height
401                line = current_line if use_current else next_line
402                current_line = CharLine(self, current_line.chars + next_line.chars,
403                                        line.bottom, line.origin, line.top,
404                                        height, line.rotation,
405                                        sort_origin=line._sort_origin)
406            else:
407                # The next line does not overlap the current line
408                merged_lines.append(current_line)
409                current_line = next_line
410        # append last line
411        merged_lines.append(current_line)
412
413        # Sort all lines horizontally based on character origin
414        sorted_lines = []
415        for line in merged_lines:
416            if line.rotation == 90:
417                def sort_key(char):
418                    if char.unicode in {0xa, 0xd}:
419                        return char.tbbox.midpoint.y - 1e9
420                    return char.tbbox.midpoint.y
421            elif line.rotation == 270:
422                def sort_key(char):
423                    if char.unicode in {0xa, 0xd}:
424                        return -char.tbbox.midpoint.y + 1e9
425                    return -char.tbbox.midpoint.y
426            else:
427                def sort_key(char):
428                    if char.unicode in {0xa, 0xd}:
429                        return char.origin.x + 1e9
430                    return char.origin.x
431            sorted_lines.append(CharLine(self, sorted(line.chars, key=sort_key),
432                                         line.bottom, line.origin,
433                                         line.top, line.height,
434                                         line.rotation, area.left,
435                                         sort_origin=line._sort_origin))
436
437        return sorted_lines
438
439    def _content_areas(self, area: Rectangle, with_graphics: bool = True) -> list:
440        if with_graphics:
441            graphics = self._graphics_filtered(area)
442            regions = []
443            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
444                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
445                for reg in regions:
446                    if reg.overlaps(gbbox.bottom, gbbox.top):
447                        # They overlap, so merge them
448                        reg.v0 = min(reg.v0, gbbox.bottom)
449                        reg.v1 = max(reg.v1, gbbox.top)
450                        reg.objs.append(graphic)
451                        break
452                else:
453                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))
454
455            # print(regions)
456            areas = []
457            ypos = area.top
458            for reg in regions:
459                if ypos - reg.v1 > self._spacing["y_em"]:
460                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
461                for obj in reg.objs:
462                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
463                    areas.append((oarea, obj))
464                ypos = reg.v0
465            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
466        else:
467            areas = [(area, None)]
468        return areas
469
470    def _objects_filtered(self, area: Rectangle, with_graphics: bool = True) -> list:
471        self._link_characters()
472        areas = self._content_areas(area, with_graphics)
473        objects = []
474        for narea, obj in areas:
475            if obj is None:
476                objects += self._charlines_filtered(narea)
477            else:
478                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
479                predicate = lambda c: not obj.bbox.contains(c.origin)
480                lines = self._charlines_filtered(oarea, predicate)
481                # print(obj, oarea, lines, [line.content for line in lines])
482                objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
483        return objects
484
485    @property
486    def content_ast(self) -> list:
487        ast = []
488        with_graphics = True
489        if "DS" in self.pdf.name:
490            # FIXME: Terrible hack to get the ordering information table fixed
491            # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
492            order_page = next((item.page_index for item in self.pdf.toc if item.level == 0 and
493                               re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)), -1)
494            with_graphics = (order_page != self.index)
495        for area in self._areas["content"]:
496            ast.append(self._ast_filtered(area, with_graphics=with_graphics))
497        # Add a page node to the first leaf to keep track of where a page starts
498        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
499        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
500        return ast
501
502    def _graphics_filtered(self, area) -> list:
503        # Find all graphic clusters in this area
504        em = self._spacing["y_em"]
505        large_area = area.offset_x(em/2)
506        graphic_clusters = self.graphic_clusters(lambda p: large_area.contains(p.bbox), em/2)
507        # for bbox, paths in raw_graphic_clusters:
508        #     # Some docs have large DRAFT chars in the background
509        #     if any(path.fill == 0xe6e6e6ff and path.stroke == 0xff for path in paths):
510        #         continue
511        #     graphic_clusters.append((bbox, paths))
512
513        # Find the captions and group them by y origin to catch side-by-side figures
514        ycaptions = defaultdict(list)
515        for line in self._charlines_filtered(area, lambda c: "Bold" in c.font):
516            for cluster in line.clusters():
517                for phrase in [r"Figure \d+\.", r"Table \d+\."]:
518                    if re.match(phrase, cluster.content):
519                        ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars))
520        ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)]
521
522        # Now associate these captions with the graphics bboxes
523        categories = []
524        for captions in ycaptions:
525            width = area.width / len(captions)
526            for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)):
527                left, right = area.left + ii * width, area.left + (ii + 1) * width
528                bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height
529
530                # Find the graphic associated with this caption
531                graphic = next(((b, p) for b, p in graphic_clusters
532                                if b.bottom <= bottom and
533                                   left <= b.left and b.right <= right), None)
534                if graphic is None:
535                    LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
536                    continue
537
538                if self._template == "blue_gray":
539                    # Search for all lines of the current caption with the same properties
540                    cbbox = Rectangle(left, bottom, right, top)
541                    cchars = self.chars_in_area(cbbox)
542                    while True:
543                        nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top)
544                        nchars = self.chars_in_area(nbbox)
545                        if len(cchars) >= len(nchars):
546                            break
547                        cbbox = nbbox
548                        cchars = nchars
549                elif self._template == "black_white":
550                    cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)
551
552                otype = phrase.split(" ")[0].lower()
553                if "Figure" in phrase:
554                    # Find all other graphics in the bounding box
555                    gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom)
556                    graphics = []
557                    for b, p in graphic_clusters:
558                        if gbbox.overlaps(b):
559                            graphics.append((b,p))
560                    for g in graphics:
561                        graphic_clusters.remove(g)
562                    gbbox = [cluster[0] for cluster in graphics]
563                    gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox)
564                    paths = [p for cluster in graphics for p in cluster[1]]
565
566                    if self._template == "blue_gray":
567                        # Search for characters below the graphics bbox, max 1 y_em
568                        gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom)
569                        while True:
570                            gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom)
571                            if not self.chars_in_area(gbbox):
572                                break
573                    # Generate the new bounding box which includes the caption
574                    gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom)
575                elif "Table" in phrase:
576                    graphic_clusters.remove(graphic)
577                    gbbox, paths = graphic
578                    if (self._template == "black_white" and
579                        sum(1 for path in paths if path.count == 2) >= len(paths) / 2):
580                        otype += "_lines"
581                categories.append((otype, cbbox, gbbox, paths))
582
583        # Deal with the remaining graphic categories
584        for gbbox, paths in graphic_clusters:
585            if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
586                continue
587            if any(isinstance(p, Image) for p in paths):
588                category = "figure"
589            elif self._template == "blue_gray":
590                if all(self._colors(path.stroke) == "gray" or
591                       self._colors(path.fill) == "darkblue" for path in paths):
592                    category = "table"
593                else:
594                    category = "figure"
595            elif self._template == "black_white":
596                # Some tables are rendered explicitly with filled rectangular
597                # shapes with others are implicitly rendered with stroked lines
598                stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2
599                is_table = stroked_table_lines or all(
600                    [any(p.isclose(pp) for pp in path.bbox.points)
601                     for p in path.points].count(True) >= len(path.points) * 2 / 3
602                    for path in paths)
603                if (len(paths) > 1 and is_table):
604                    category = "table"
605                    if stroked_table_lines:
606                        category += "_lines"
607                else:
608                    category = "figure"
609
610            if "table" in category:
611                # Check if there are only numbers on top of the table
612                cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"])
613                nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xa, 0xd}]
614
615                if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3:
616                    # This is a register table with invisible top borders!
617                    cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars))
618                    gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top)
619                    name = "register_" + category
620                else:
621                    cbbox = None
622                    name = category
623                categories.append((name, cbbox, gbbox, paths))
624            else:
625                categories.append(("figure", None, gbbox, paths))
626
627        # Convert the objects into specialized classes
628        categories.sort(key=lambda o: (-o[2].y, o[2].x))
629        objects = []
630        for otype, caption_bbox, graphics_bbox, graphics_paths in categories:
631            if "figure" in otype:
632                figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths)
633                objects.append(figure)
634            elif "table" in otype:
635                xlines, ylines, yhlines = [], [], []
636                for path in graphics_paths:
637                    if self._template == "blue_gray" or "_lines" in otype:
638                        if self._colors(path.stroke) == "gray" or "_lines" in otype:
639                            # Intercell paths in gray
640                            if len(path.lines) == 1:
641                                line = path.lines[0]
642                                if line.direction == line.Direction.VERTICAL:
643                                    xlines.append(line.specialize())
644                                elif line.direction == line.Direction.HORIZONTAL:
645                                    ylines.append(line.specialize())
646                                else:
647                                    LOGGER.warn(f"Line not vertical or horizontal: {line}")
648                            else:
649                                LOGGER.warn(f"Path too long: {path}")
650                        elif self._colors(path.fill) == "darkblue":
651                            # Add the bottom line of the dark blue header box as a very thick line
652                            line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
653                            yhlines.append(line)
654
655                    elif self._template == "black_white":
656                        bbox = path.bbox
657                        is_vertical = bbox.width < bbox.height
658                        width = bbox.width if is_vertical else bbox.height
659                        length = bbox.height if is_vertical else bbox.width
660                        if width <= self._spacing["x_em"] / 2:
661                            if length >= self._spacing["y_em"] / 2:
662                                if is_vertical:
663                                    line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width)
664                                    xlines.append(line)
665                                else:
666                                    line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height)
667                                    ylines.append(line)
668                        else:
669                            # Split the rectangle into it's outline
670                            xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1))
671                            xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1))
672                            ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1))
673                            ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1))
674                if yhlines:
675                    yhlines.sort(key=lambda l: l.p0.y)
676                    ylines.append(yhlines[0])
677                if not xlines or not ylines:
678                    continue
679                table = Table(self, graphics_bbox, xlines, ylines, caption_bbox,
680                              is_register="register" in otype)
681                objects.append(table)
682
683        return objects
684
685    @property
686    def content_objects(self) -> list:
687        objs = []
688        for area in self._areas["content"]:
689            objs.extend(self._objects_filtered(area))
690        return objs
691
692    @property
693    def content_graphics(self) -> list:
694        objs = []
695        for area in self._areas["content"]:
696            objs.extend(self._graphics_filtered(area))
697        return objs
698
699    @property
700    def content_lines(self) -> list:
701        return [o for o in self.content_objects if isinstance(o, CharLine)]
702
703    @property
704    def content_tables(self) -> list:
705        return [o for o in self.content_graphics if isinstance(o, Table)]
706
707    @property
708    def content_figures(self) -> list:
709        return [o for o in self.content_graphics if isinstance(o, Figure)]
710
711    def _char_properties(self, line, char):
712        cp = {
713            "superscript": False,
714            "subscript": False,
715            "bold": any(frag in char.font for frag in {"Bold"}),
716            "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
717            "underline": (char.objlink or char.weblink) is not None,
718            "size": round(line.height),
719            "relsize": self._line_size(line),
720            "char": chr(char.unicode),
721        }
722
723        if line.rotation:
724            if char.origin.x < (line.origin - 0.25 * line.height):
725                cp["superscript"] = True
726            elif char.origin.x > (line.origin + 0.15 * line.height):
727                cp["subscript"] = True
728        elif char.origin.y > (line.origin + 0.25 * line.height):
729            cp["superscript"] = True
730        elif char.origin.y < (line.origin - 0.15 * line.height):
731            cp["subscript"] = True
732
733        return cp
734
735    def _ast_filtered(self, area: Rectangle, with_graphics=True,
736                      ignore_xpos=False, with_bits=True, with_notes=True) -> list:
737        x_em = self._spacing["x_em"]
738        spacing_content = self._spacing["x_content"]
739        lh_factor = self._spacing["lh"]
740        # spacing_y = self._spacing["y_em"]
741        root = Node("area", obj=area, xpos=int(area.left), page=self)
742
743        def unindent(_xpos, _current, _newlines=1):
744            current = _current
745            # Check if we need to unindent the current node
746            while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos:
747                current = current.parent
748            if _newlines >= 2 and current.name == "para":
749                current = current.parent
750            return current
751
752        def parent_name(current):
753            return "" if current.parent is None else current.parent.name
754
755        current = root
756        ypos = area.top
757        for obj in self._objects_filtered(area, with_graphics):
758            xpos = round(obj.bbox.left)
759            # Tables should remain in their current hierarchy regardless of indentation
760            if isinstance(obj, (Table, Figure)):
761                current = next((c for c in current.iter_path_reverse()
762                                if c.name.startswith("head")), root)
763                name = "figure" if isinstance(obj, Figure) else "table"
764                Node(name, parent=current, obj=obj, xpos=xpos, number=-1,
765                     _width=obj.bbox.width / area.width, _type=obj._type)
766                ypos = obj.bbox.bottom
767            # Lines of text need to be carefully checked for indentation
768            elif isinstance(obj, CharLine):
769                newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
770                content = obj.content
771                lcontent = content.lstrip()
772                content_start = 0
773                linesize = self._line_size(obj)
774
775                # Check when the note has finished (=> paragraphs without italic)
776                if (parent_name(current) == "note" and
777                    ((current.parent.type == "note" and not obj.contains_font(current.parent._font)) or
778                     (current.parent.type in {"caution", "warning"} and newlines >= 2))):
779                    current = current.parent.parent
780
781                # Check when the list ends into something indented far too right
782                elif (parent_name(current).startswith("list")
783                      and (xpos - current.xpos) >= 2 * x_em):
784                    current = current.parent.parent
785
786                # print(obj.fonts, ypos, xpos, current.xpos, f"{obj.height:.2f}", content)
787                # Check if line is a heading, which may be multi-line, so we must
788                # be careful not to nest them, but group them properly
789                # Headings are always inserted into the root note!
790                if linesize.startswith("h1") or (linesize.startswith("h") and
791                        xpos < (spacing_content + 2 * x_em) and "Bold" in obj.chars[0].font):
792                    if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) *", content)) is not None:
793                        start = min(len(match.group(0)), len(obj.chars) - 1)
794                        marker = match.group(1)
795                        size = marker.count('.') + 2
796                    else:
797                        start = 0
798                        marker = None
799                        size = linesize[1]
800                    name = f"head{size}"
801                    # Check if we're already parsing a heading, do not split into two
802                    if parent_name(current) != name or newlines > 2:
803                        content_start = start
804                        xpos = round(obj.chars[content_start].bbox.left)
805                        current = Node(name, parent=root, obj=obj, xpos=xpos,
806                                       size=size, marker=marker)
807                        current = Node("para", parent=current, obj=obj, xpos=current.xpos)
808
809                # Check if the line is a note and deal with the indentation correctly
810                elif with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None:
811                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
812                    # print(obj.fonts)
813                    # Correct xposition only if the Note: string is very far left
814                    if xpos + 4 * x_em <= current.xpos:
815                        xpos = round(obj.chars[content_start].bbox.left)
816                    # Prevent nesting of notes, they should only be listed
817                    if parent_name(current) == "note":
818                        current =  current.parent.parent
819                    current = unindent(xpos, current, 2)
820                    current = Node("note", parent=current, obj=obj, xpos=xpos,
821                                   type=match.group(1).lower(), _font=obj.chars[content_start].font)
822                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)
823
824                # Check if line is Table or Figure caption
825                elif with_graphics and ((match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? ?", content)) is not None
826                      and "Bold" in obj.chars[0].font):
827                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
828                    current = next((c for c in current.iter_path_reverse()
829                                if c.name.startswith("head")), root)
830                    current = Node("caption", parent=current, obj=obj, xpos=xpos,
831                                   _type=match.group(1).lower(), number=int(match.group(2)))
832                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)
833
834                # Check if line is list and group them according to indentation
835                elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None:
836                    current = unindent(xpos, current, newlines)
837                    content_start = len(match.group(0)) - 2
838                    xpos = round(obj.chars[content_start].bbox.left)
839                    name = "listb"
840                    value = lcontent[0]
841                    if value in {"–", "-"}: name = "lists"
842                    elif value.isalpha(): name = "lista"
843                    elif value.isnumeric():
844                        name = "listn"
845                        value = int(match.group(2))
846                    current = Node(name, parent=current, obj=obj, xpos=xpos, value=value)
847                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)
848
849                # Check if line is a register bit definition
850                elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None:
851                    if obj.contains_font("Bold"):
852                        # Use the bold character as delimiter
853                        content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font)
854                    else:
855                        # Default back to the regex
856                        if "Reserved" not in content:
857                            LOGGER.warning(f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}")
858                        content_start = re.match(r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content)
859                        if content_start is None:
860                            LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
861                            content_start = 0
862                        else:
863                            content_start = len(content_start.group(0))
864                        if not content_start:
865                            LOGGER.error(f"Missing content start (=0)! '{content}'!")
866                        content_start = min(content_start, len(obj.chars) - 1)
867
868                    current = next((c for c in current.iter_path_reverse()
869                                    if c.name.startswith("head")), root)
870                    middle = obj.chars[content_start].bbox.left
871                    xpos = round(middle)
872                    current = Node("bit", parent=current, obj=obj, xpos=xpos, _page=self,
873                                   _middle=middle, _left=area.left, _right=area.right)
874                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)
875
876                # Check if this is a new paragraph
877                elif newlines >= 2 or current.name not in {"para"}:
878                    # Fix issues where notes are reflowing back left of Note: text
879                    if parent_name(current) in {"note"}:
880                        if xpos < current.parent.xpos:
881                            xpos = current.parent.xpos
882                    # Prevent multiline
883                    current = unindent(xpos, current, newlines)
884                    current = Node("para", parent=current, obj=obj,
885                                   xpos=xpos if current.is_root else current.xpos)
886
887                elif (parent_name(current) not in {"caption", "bit", "area"}):
888                    current = unindent(xpos, current, newlines)
889
890                # Add the actual line
891                Node("line", parent=current, obj=obj, xpos=xpos,
892                     start=content_start, str=content[content_start:50])
893
894                ypos = obj.origin
895
896        return root
897
898    def __repr__(self) -> str:
899        return f"StPage({self.number})"

This class provides low-level access to the graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.

Page(document, index: int)
263    def __init__(self, document, index: int):
264        super().__init__(document, index)
265        self._template = "black_white"
266        producer = self.pdf.metadata.get("Producer", "").lower()
267        if "acrobat" in producer:
268            pass # default
269        elif "antenna" in producer:
270            self._template = "blue_gray"
271        else:
272            LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")
273
274        if "blue_gray" in self._template:
275            self._areas = areas_blue_gray(self)
276            self._spacing = spacing_blue_gray(self)
277            self._colors = colors_blue_gray
278            self._line_size = linesize_blue_gray
279        elif "black_white" in self._template:
280            self._areas = areas_black_white(self)
281            self._spacing = spacing_black_white(self)
282            self._colors = colors_black_white
283            self._line_size = linesize_black_white
284
285        # Patches to detect the header cells correctly
286        if ((self.pdf.name == "DS12930-v1" and self.index in range(90, 106)) or
287            (self.pdf.name == "DS12931-v1" and self.index in range(89, 105))):
288            self._spacing["th"] = 0.1
289        if ((self.pdf.name == "RM0453-v2" and self.index in [1354]) or
290            (self.pdf.name == "RM0456-v2" and self.index in [2881]) or
291            (self.pdf.name == "RM0456-v3" and self.index in [2880]) or
292            (self.pdf.name == "RM0461-v4" and self.index in [1246])):
293            self._spacing["th"] = 0.5
294        if ((self.pdf.name == "RM0456-v2" and self.index in [3005])):
295            self._spacing["th"] = 0.52
Parameters
  • document: a PDF document.
  • index: 0-index page number.
identifier: str
307    @cached_property
308    def identifier(self) -> str:
309        return self._text_in_area("id", check_length=False)
top: str
311    @cached_property
312    def top(self) -> str:
313        if self.index == 0:
314            return "Cover"
315        return self._text_in_area("top", check_length=False)
def is_relevant(self) -> bool:
317    def is_relevant(self) -> bool:
318        if any(c in self.top for c in {"Contents", "List of ", "Index"}):
319            return False
320        return True
content_ast: list
485    @property
486    def content_ast(self) -> list:
487        ast = []
488        with_graphics = True
489        if "DS" in self.pdf.name:
490            # FIXME: Terrible hack to get the ordering information table fixed
491            # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
492            order_page = next((item.page_index for item in self.pdf.toc if item.level == 0 and
493                               re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)), -1)
494            with_graphics = (order_page != self.index)
495        for area in self._areas["content"]:
496            ast.append(self._ast_filtered(area, with_graphics=with_graphics))
497        # Add a page node to the first leaf to keep track of where a page starts
498        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
499        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
500        return ast
content_objects: list
685    @property
686    def content_objects(self) -> list:
687        objs = []
688        for area in self._areas["content"]:
689            objs.extend(self._objects_filtered(area))
690        return objs
content_graphics: list
692    @property
693    def content_graphics(self) -> list:
694        objs = []
695        for area in self._areas["content"]:
696            objs.extend(self._graphics_filtered(area))
697        return objs
content_lines: list
699    @property
700    def content_lines(self) -> list:
701        return [o for o in self.content_objects if isinstance(o, CharLine)]
content_tables: list
703    @property
704    def content_tables(self) -> list:
705        return [o for o in self.content_graphics if isinstance(o, Table)]
content_figures: list
707    @property
708    def content_figures(self) -> list:
709        return [o for o in self.content_graphics if isinstance(o, Figure)]
Inherited Members
modm_data.pdf.page.Page
index
number
label
width
height
rotation
bbox
char_count
char
chars
chars_in_area
text_in_area
structures
find
paths
images
graphic_clusters
pypdfium2._helpers.page.PdfPage
parent
get_width
get_height
get_size
get_rotation
set_rotation
get_mediabox
set_mediabox
get_cropbox
set_cropbox
get_bleedbox
set_bleedbox
get_trimbox
set_trimbox
get_artbox
set_artbox
get_bbox
get_textpage
insert_obj
remove_obj
gen_content
get_objects
render
pypdfium2.internal.bases.AutoCloseable
close