modm_data.pdf2html.stmicro.page

View Source

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4import re
  5import math
  6import logging
  7import textwrap
  8import statistics
  9from functools import cached_property, cache, reduce
 10from collections import defaultdict
 11from .table import Table
 12from ..figure import Figure
 13from ..line import CharLine
 14from ...utils import HLine, VLine, Rectangle, Region
 15from ...pdf import Path, Image, Page as PdfPage
 16from anytree import Node
 17
 18
 19LOGGER = logging.getLogger(__name__)
 20
 21def is_compatible(document) -> bool:
 22    if "stmicro" in document.metadata.get("Author", "").lower():
 23        return True
 24    return False
 25
 26
 27def areas_black_white(page) -> dict:
 28    def _scale(r):
 29        if page.rotation:
 30            return Rectangle(r.bottom * page.width, (1 - r.right) * page.height,
 31                             r.top * page.width, (1 - r.left) * page.height)
 32        return Rectangle(r.left * page.width, r.bottom * page.height,
 33                         r.right * page.width, r.top * page.height)
 34
 35    bottom_left = Rectangle(0.1, 0.1, 0.3, 0.12)
 36    bottom_middle = Rectangle(0.3, 0.1, 0.7, 0.12)
 37    bottom_right = Rectangle(0.7, 0.1, 0.9, 0.12)
 38    top = Rectangle(0.1, 0.9125, 0.9, 0.9375)
 39    content = Rectangle(0.025, 0.12, 0.975, 0.905 if page.index else 0.79)
 40    all_content = [content]
 41    areas = {
 42        # Bottom string in the middle: Example "RM0410 Rev 4"
 43        "id": bottom_middle,
 44    }
 45    if page.index == 0:
 46        # Publish date on the bottom left on first page
 47        areas["date"] = bottom_left
 48        # number on the bottom right on first page
 49        areas["number"] = bottom_right
 50        # Add top areas
 51        all_content.insert(0, Rectangle(0.375, 0.855, 0.975, 0.9125))
 52        all_content.insert(1, Rectangle(0.025, 0.805, 0.975, 0.855))
 53    else:
 54        # Page number on bottom
 55        areas["number"] = bottom_left if page.index % 2 else bottom_right
 56        # Chapter name on top
 57        areas["top"] = top
 58
 59    # Recognize the two column design of the Datasheets with a big table underneath
 60    if page.index < 3 and "DS" in page.pdf.name:
 61        # Find a wide path that would denote the beginning of a table
 62        top_rect = [p.bbox.top / page.height for p in page.paths
 63                    if _scale(content).contains(p.bbox) and p.bbox.width > page.width * 0.75]
 64        if top_rect:
 65            # offset for table label just above it
 66            ybottom = max(*top_rect) + 0.0175
 67        else:
 68            ybottom = content.bottom
 69        # Try to find list or sublists in these areas
 70        mr = Rectangle(0.49, ybottom, 0.51, content.top)
 71        br = Rectangle(0.51, ybottom, 0.5325, content.top)
 72        hr = Rectangle(0.5325, ybottom, 0.555, content.top)
 73        text_middle = page.text_in_area(_scale(mr))
 74        text_bullets = page.text_in_area(_scale(br))
 75        text_hyphens = page.text_in_area(_scale(hr))
 76        if (not text_middle and
 77            (any(c in text_bullets for c in {"•", chr(61623)}) or
 78             any(c in text_hyphens for c in {"-"}))):
 79            areas["middle_bullets"] = br
 80            areas["middle_hyphens"] = hr
 81            all_content = all_content[:-1]
 82            all_content.append(Rectangle(content.left, ybottom, 0.5, content.top))
 83            all_content.append(Rectangle(0.505, ybottom, content.right, content.top))
 84            if top_rect:
 85                all_content.append(Rectangle(content.left, content.bottom, content.right, ybottom))
 86
 87    areas["content"] = all_content
 88    scaled_areas = {}
 89    for name, area in areas.items():
 90        if isinstance(area, list):
 91            scaled_areas[name] = [_scale(r) for r in area]
 92        else:
 93            scaled_areas[name] = _scale(area)
 94    return scaled_areas
 95
 96
 97def areas_blue_gray(page) -> dict:
 98    def _scale(r):
 99        return Rectangle(r.left * page.width, r.bottom * page.height,
100                         r.right * page.width, r.top * page.height)
101
102    # This template doesn't use rotated pages, instead uses
103    # hardcoded rotated page dimensions
104    if page.width > page.height:
105        content = Rectangle(0.05, 0.025, 0.89, 0.975)
106        bottom_left = Rectangle(0, 0.6, 0.05, 1)
107        top_right = Rectangle(0.9025, 0.05, 0.9175, 0.7)
108    else:
109        content = Rectangle(0.025, 0.05, 0.975, 0.89 if page.index else 0.81)
110        bottom_left = Rectangle(0, 0, 0.4, 0.05)
111        top_right = Rectangle(0.3, 0.9025, 0.95, 0.9175)
112    areas = {
113        "id": bottom_left,
114        "top": top_right,
115        "all_content": content,
116        "content": []
117    }
118    if page.index == 0:
119        areas["content"] = [
120            # Document device string
121            Rectangle(0.4, 0.91, 0.95, 0.95),
122            # Document description string
123            Rectangle(0.05, 0.81, 0.95, 0.86)
124        ]
125    if page.index < 10:
126        # Contains only a table with product summary
127        br = Rectangle(0.35, content.bottom, 0.37, content.top)
128        text_bullets = page.text_in_area(_scale(br))
129        if any(c in text_bullets for c in {"•", chr(61623)}):
130            areas["middle_bullets"] = br
131            # Contains the actual content here
132            left = Rectangle(content.left, content.bottom, 0.3565, content.top)
133            right = Rectangle(0.3565, content.bottom, content.right, content.top)
134            areas["content"].extend([left, right])
135        else:
136            areas["content"] = [content]
137    else:
138        areas["content"] = [content]
139
140    scaled_areas = {}
141    for name, area in areas.items():
142        if isinstance(area, list):
143            scaled_areas[name] = [_scale(r) for r in area]
144        else:
145            scaled_areas[name] = _scale(area)
146    return scaled_areas
147
148
def spacing_black_white(page) -> dict:
    """Return the spacing constants of the black/white template.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dictionary with the horizontal (``x_*``) and vertical (``y_*``)
        spacings in page units and the relative thresholds ``lh``, ``sc``
        and ``th``.
    """
    rotated = page.rotation
    # Relative left/right content margin
    margin = 0.14 if rotated else 0.1125
    # On rotated pages the em and table line sizes use the swapped dimension
    if rotated:
        x_base, y_base = page.height, page.width
    else:
        x_base, y_base = page.width, page.height

    return {
        # Horizontal spacing: left->right
        "x_em": 0.01 * x_base,
        "x_left": margin * page.width,
        "x_right": (1 - margin) * page.width,
        "x_content": 0.2075 * page.width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * y_base,
        # Max table line thickness
        "y_tline": 0.005 * y_base,
        # Max line height distance to detect paragraphs
        "lh": 1.2 if rotated else 0.9,
        # Max line height distance to detect super-/subscript
        "sc": 0.4 if rotated else 0.325,
        # Table header cell bold text threshold
        "th": 0.33,
    }
181
182
def spacing_blue_gray(page) -> dict:
    """Return the spacing constants of the blue/gray template.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dictionary with the horizontal (``x_*``) and vertical (``y_*``)
        spacings in page units and the relative thresholds ``lh``, ``sc``
        and ``th``.
    """
    rotated = page.rotation
    if rotated:
        # Relative left edge, right edge and content indent on rotated pages
        rel_left, rel_right, rel_content = 0.05, 1 - 0.16, 0.2075
        x_base, y_base = page.height, page.width
    else:
        rel_left, rel_right, rel_content = 0.07, 1 - 0.07, 0.165
        x_base, y_base = page.width, page.height

    return {
        # Horizontal spacing: left->right
        "x_em": 0.01 * x_base,
        "x_left": rel_left * page.width,
        "x_right": rel_right * page.width,
        "x_content": rel_content * page.width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * y_base,
        # Max table line thickness
        "y_tline": 0.005 * y_base,
        # Max line height distance to detect paragraphs
        "lh": 1.6 if rotated else 0.9,
        # Max line height distance to detect super-/subscript
        "sc": 0.2 if rotated else 0.3,
        # Table header cell bold text threshold
        "th": 0.33,
    }
214
215
def linesize_black_white(line: "CharLine") -> str:
    """Classify a line of the black/white template by its height.

    :param line: object with a numeric ``height`` attribute (the parameter
        was previously annotated as ``float``, but ``.height`` is read).
    :return: ``"h1"`` .. ``"h4"`` for headings, ``"n"`` for normal text or
        ``"fn"`` for anything smaller.
    """
    rsize = line.height
    if rsize >= 17.5:
        return "h1"
    if rsize >= 15.5:
        return "h2"
    if rsize >= 13.5:
        return "h3"
    if rsize >= 11.4:
        return "h4"
    if rsize >= 8.5:
        return "n"
    return "fn"
224
225
def linesize_blue_gray(line: "CharLine") -> str:
    """Classify a line of the blue/gray template by its rounded height.

    :param line: object with a numeric ``height`` attribute (the parameter
        was previously annotated as ``float``, but ``.height`` is read).
    :return: ``"h1"`` .. ``"h4"`` for headings, ``"n"`` for normal text or
        ``"fn"`` for anything smaller.
    """
    # Unlike the black/white template, the height is rounded before comparing
    rsize = round(line.height)
    if rsize >= 16:
        return "h1"
    if rsize >= 14:
        return "h2"
    if rsize >= 12:
        return "h3"
    if rsize >= 10:
        return "h4"
    if rsize >= 7:
        return "n"
    return "fn"
234
235
def colors_black_white(color: int) -> str:
    """Translate a color value of the black/white template into a name.

    :param color: numeric color value.
    :return: ``"black"``, ``"white"`` or ``"unknown"``.
    """
    # (lowest, highest, name): inclusive value ranges per color name
    known_colors = (
        (0xff, 0xff, "black"),
        (0xffffffff, 0xffffffff, "white"),
    )
    for lowest, highest, name in known_colors:
        if lowest <= color <= highest:
            return name
    return "unknown"
242
243
def colors_blue_gray(color: int) -> str:
    """Translate a color value of the blue/gray template into a name.

    :param color: numeric color value.
    :return: one of ``"black"``, ``"white"``, ``"gray"``, ``"lightblue"``,
        ``"darkblue"``, ``"blue"`` or ``"unknown"``.
    """
    # (lowest, highest, name): inclusive value ranges per color name
    known_colors = (
        (0xff, 0xff, "black"),
        (0xffffffff, 0xffffffff, "white"),
        (0xb9c4caff, 0xb9c4caff, "gray"),
        (0x1f81afff, 0x1f81afff, "lightblue"),
        (0x2052ff, 0x2052ff, "darkblue"),
        (0x39a9dcff, 0x39a9dcff, "blue"),
    )
    for lowest, highest, name in known_colors:
        if lowest <= color <= highest:
            return name
    return "unknown"
258
259
260class Page(PdfPage):
261
262    def __init__(self, document, index: int):
263        super().__init__(document, index)
264        self._template = "black_white"
265        producer = self.pdf.metadata.get("Producer", "").lower()
266        if "acrobat" in producer:
267            pass # default
268        elif "antenna" in producer:
269            self._template = "blue_gray"
270        else:
271            LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")
272
273        if "blue_gray" in self._template:
274            self._areas = areas_blue_gray(self)
275            self._spacing = spacing_blue_gray(self)
276            self._colors = colors_blue_gray
277            self._line_size = linesize_blue_gray
278        elif "black_white" in self._template:
279            self._areas = areas_black_white(self)
280            self._spacing = spacing_black_white(self)
281            self._colors = colors_black_white
282            self._line_size = linesize_black_white
283
284        # Patches to detect the header cells correctly
285        if ((self.pdf.name == "DS12930-v1" and self.index in range(90, 106)) or
286            (self.pdf.name == "DS12931-v1" and self.index in range(89, 105))):
287            self._spacing["th"] = 0.1
288        if ((self.pdf.name == "RM0453-v2" and self.index in [1354]) or
289            (self.pdf.name == "RM0456-v2" and self.index in [2881]) or
290            (self.pdf.name == "RM0456-v3" and self.index in [2880]) or
291            (self.pdf.name == "RM0461-v4" and self.index in [1246])):
292            self._spacing["th"] = 0.5
293        if ((self.pdf.name == "RM0456-v2" and self.index in [3005])):
294            self._spacing["th"] = 0.52
295
296    def _text_in_area(self, name, check_length=True) -> str:
297        if name not in self._areas: return ""
298        text = ""
299        areas = self._areas[name]
300        if not isinstance(areas, list): areas = [areas]
301        for area in areas:
302            text += self.text_in_area(area)
303        if check_length: assert text
304        return text
305
306    @cached_property
307    def identifier(self) -> str:
308        return self._text_in_area("id", check_length=False)
309
310    @cached_property
311    def top(self) -> str:
312        if self.index == 0:
313            return "Cover"
314        return self._text_in_area("top", check_length=False)
315
316    def is_relevant(self) -> bool:
317        if any(c in self.top for c in {"Contents", "List of ", "Index"}):
318            return False
319        return True
320
321    def _charlines_filtered(self, area, predicate = None, rtol = None) -> list[CharLine]:
322        if rtol is None: rtol = self._spacing["sc"]
323        # Split all chars into lines based on rounded origin
324        origin_lines_y = defaultdict(list)
325        origin_lines_x = defaultdict(list)
326        for char in self.chars_in_area(area):
327            # Ignore all characters we don't want
328            if predicate is not None and not predicate(char):
329                continue
330            # Ignore Carriage Return characters and ® (superscript issues)
331            if char.unicode in {0xd, ord("®")}:
332                continue
333            # Correct some weird unicode stuffing choices
334            if char.unicode in {2}:
335                char.unicode = ord("-")
336            if char.unicode in {61623, 61664}:
337                char.unicode = ord("•")
338            if char.unicode < 32 and char.unicode not in {0xa}:
339                continue
340            # Ignore characters without width that are not spaces
341            if not char.width and char.unicode not in {0xa, 0xd, 0x20}:
342                LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
343            # Split up the chars depending on the orientation
344            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
345                origin_lines_x[round(char.origin.x, 1)].append(char)
346            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
347                origin_lines_y[round(char.origin.y, 1)].append(char)
348            else:
349                LOGGER.error("Unknown char rotation:", char, char.rotation)
350
351        # Convert characters into lines
352        bbox_lines_y = []
353        for chars in origin_lines_y.values():
354            # Remove lines with whitespace only
355            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
356                continue
357            origin = statistics.fmean(c.origin.y for c in chars)
358            line = CharLine(self, chars,
359                            min(c.bbox.bottom for c in chars),
360                            origin,
361                            max(c.bbox.top for c in chars),
362                            max(c.height for c in chars),
363                            sort_origin=self.height - origin)
364            bbox_lines_y.append(line)
365            # print(line, line.top, line.origin, line.bottom, line.height)
366        bbox_lines = sorted(bbox_lines_y, key=lambda l: l._sort_origin)
367
368        bbox_lines_x = []
369        for chars in origin_lines_x.values():
370            # Remove lines with whitespace only
371            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
372                continue
373            line = CharLine(self, chars,
374                            min(c.bbox.left for c in chars),
375                            statistics.fmean(c.origin.x for c in chars),
376                            max(c.bbox.right for c in chars),
377                            max(c.width for c in chars),
378                            270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90)
379            bbox_lines_x.append(line)
380        bbox_lines += sorted(bbox_lines_x, key=lambda l: l._sort_origin)
381
382        if not bbox_lines:
383            return []
384
385        # Merge lines that have overlapping bbox_lines
386        # FIXME: This merges lines that "collide" vertically like in formulas
387        merged_lines = []
388        current_line = bbox_lines[0]
389        for next_line in bbox_lines[1:]:
390            height = max(current_line.height, next_line.height)
391            # Calculate overlap via normalize origin (increasing with line index)
392            if ((current_line._sort_origin + rtol * height) >
393                (next_line._sort_origin - rtol * height)):
394                # if line.rotation or self.rotation:
395                #     # The next line overlaps this one, we merge the shorter line
396                #     # (typically super- and subscript) into taller line
397                #     use_current = len(current_line.chars) >= len(next_line.chars)
398                # else:
399                use_current = current_line.height >= next_line.height
400                line = current_line if use_current else next_line
401                current_line = CharLine(self, current_line.chars + next_line.chars,
402                                        line.bottom, line.origin, line.top,
403                                        height, line.rotation,
404                                        sort_origin=line._sort_origin)
405            else:
406                # The next line does not overlap the current line
407                merged_lines.append(current_line)
408                current_line = next_line
409        # append last line
410        merged_lines.append(current_line)
411
412        # Sort all lines horizontally based on character origin
413        sorted_lines = []
414        for line in merged_lines:
415            if line.rotation == 90:
416                def sort_key(char):
417                    if char.unicode in {0xa, 0xd}:
418                        return char.tbbox.midpoint.y - 1e9
419                    return char.tbbox.midpoint.y
420            elif line.rotation == 270:
421                def sort_key(char):
422                    if char.unicode in {0xa, 0xd}:
423                        return -char.tbbox.midpoint.y + 1e9
424                    return -char.tbbox.midpoint.y
425            else:
426                def sort_key(char):
427                    if char.unicode in {0xa, 0xd}:
428                        return char.origin.x + 1e9
429                    return char.origin.x
430            sorted_lines.append(CharLine(self, sorted(line.chars, key=sort_key),
431                                         line.bottom, line.origin,
432                                         line.top, line.height,
433                                         line.rotation, area.left,
434                                         sort_origin=line._sort_origin))
435
436        return sorted_lines
437
438    def _content_areas(self, area: Rectangle, with_graphics: bool = True) -> list:
439        if with_graphics:
440            graphics = self._graphics_filtered(area)
441            regions = []
442            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
443                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
444                for reg in regions:
445                    if reg.overlaps(gbbox.bottom, gbbox.top):
446                        # They overlap, so merge them
447                        reg.v0 = min(reg.v0, gbbox.bottom)
448                        reg.v1 = max(reg.v1, gbbox.top)
449                        reg.objs.append(graphic)
450                        break
451                else:
452                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))
453
454            # print(regions)
455            areas = []
456            ypos = area.top
457            for reg in regions:
458                if ypos - reg.v1 > self._spacing["y_em"]:
459                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
460                for obj in reg.objs:
461                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
462                    areas.append((oarea, obj))
463                ypos = reg.v0
464            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
465        else:
466            areas = [(area, None)]
467        return areas
468
469    def _objects_filtered(self, area: Rectangle, with_graphics: bool = True) -> list:
470        self._link_characters()
471        areas = self._content_areas(area, with_graphics)
472        objects = []
473        for narea, obj in areas:
474            if obj is None:
475                objects += self._charlines_filtered(narea)
476            else:
477                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
478                predicate = lambda c: not obj.bbox.contains(c.origin)
479                lines = self._charlines_filtered(oarea, predicate)
480                # print(obj, oarea, lines, [line.content for line in lines])
481                objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
482        return objects
483
484    @property
485    def content_ast(self) -> list:
486        ast = []
487        with_graphics = True
488        if "DS" in self.pdf.name:
489            # FIXME: Terrible hack to get the ordering information table fixed
490            # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
491            order_page = next((item.page_index for item in self.pdf.toc if item.level == 0 and
492                               re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)), -1)
493            with_graphics = (order_page != self.index)
494        for area in self._areas["content"]:
495            ast.append(self._ast_filtered(area, with_graphics=with_graphics))
496        # Add a page node to the first leaf to keep track of where a page starts
497        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
498        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
499        return ast
500
501    def _graphics_filtered(self, area) -> list:
502        # Find all graphic clusters in this area
503        em = self._spacing["y_em"]
504        large_area = area.offset_x(em/2)
505        graphic_clusters = self.graphic_clusters(lambda p: large_area.contains(p.bbox), em/2)
506        # for bbox, paths in raw_graphic_clusters:
507        #     # Some docs have large DRAFT chars in the background
508        #     if any(path.fill == 0xe6e6e6ff and path.stroke == 0xff for path in paths):
509        #         continue
510        #     graphic_clusters.append((bbox, paths))
511
512        # Find the captions and group them by y origin to catch side-by-side figures
513        ycaptions = defaultdict(list)
514        for line in self._charlines_filtered(area, lambda c: "Bold" in c.font):
515            for cluster in line.clusters():
516                for phrase in [r"Figure \d+\.", r"Table \d+\."]:
517                    if re.match(phrase, cluster.content):
518                        ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars))
519        ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)]
520
521        # Now associate these captions with the graphics bboxes
522        categories = []
523        for captions in ycaptions:
524            width = area.width / len(captions)
525            for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)):
526                left, right = area.left + ii * width, area.left + (ii + 1) * width
527                bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height
528
529                # Find the graphic associated with this caption
530                graphic = next(((b, p) for b, p in graphic_clusters
531                                if b.bottom <= bottom and
532                                   left <= b.left and b.right <= right), None)
533                if graphic is None:
534                    LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
535                    continue
536
537                if self._template == "blue_gray":
538                    # Search for all lines of the current caption with the same properties
539                    cbbox = Rectangle(left, bottom, right, top)
540                    cchars = self.chars_in_area(cbbox)
541                    while True:
542                        nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top)
543                        nchars = self.chars_in_area(nbbox)
544                        if len(cchars) >= len(nchars):
545                            break
546                        cbbox = nbbox
547                        cchars = nchars
548                elif self._template == "black_white":
549                    cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)
550
551                otype = phrase.split(" ")[0].lower()
552                if "Figure" in phrase:
553                    # Find all other graphics in the bounding box
554                    gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom)
555                    graphics = []
556                    for b, p in graphic_clusters:
557                        if gbbox.overlaps(b):
558                            graphics.append((b,p))
559                    for g in graphics:
560                        graphic_clusters.remove(g)
561                    gbbox = [cluster[0] for cluster in graphics]
562                    gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox)
563                    paths = [p for cluster in graphics for p in cluster[1]]
564
565                    if self._template == "blue_gray":
566                        # Search for characters below the graphics bbox, max 1 y_em
567                        gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom)
568                        while True:
569                            gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom)
570                            if not self.chars_in_area(gbbox):
571                                break
572                    # Generate the new bounding box which includes the caption
573                    gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom)
574                elif "Table" in phrase:
575                    graphic_clusters.remove(graphic)
576                    gbbox, paths = graphic
577                    if (self._template == "black_white" and
578                        sum(1 for path in paths if path.count == 2) >= len(paths) / 2):
579                        otype += "_lines"
580                categories.append((otype, cbbox, gbbox, paths))
581
582        # Deal with the remaining graphic categories
583        for gbbox, paths in graphic_clusters:
584            if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
585                continue
586            if any(isinstance(p, Image) for p in paths):
587                category = "figure"
588            elif self._template == "blue_gray":
589                if all(self._colors(path.stroke) == "gray" or
590                       self._colors(path.fill) == "darkblue" for path in paths):
591                    category = "table"
592                else:
593                    category = "figure"
594            elif self._template == "black_white":
595                # Some tables are rendered explicitly with filled rectangular
596                # shapes with others are implicitly rendered with stroked lines
597                stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2
598                is_table = stroked_table_lines or all(
599                    [any(p.isclose(pp) for pp in path.bbox.points)
600                     for p in path.points].count(True) >= len(path.points) * 2 / 3
601                    for path in paths)
602                if (len(paths) > 1 and is_table):
603                    category = "table"
604                    if stroked_table_lines:
605                        category += "_lines"
606                else:
607                    category = "figure"
608
609            if "table" in category:
610                # Check if there are only numbers on top of the table
611                cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"])
612                nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xa, 0xd}]
613
614                if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3:
615                    # This is a register table with invisible top borders!
616                    cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars))
617                    gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top)
618                    name = "register_" + category
619                else:
620                    cbbox = None
621                    name = category
622                categories.append((name, cbbox, gbbox, paths))
623            else:
624                categories.append(("figure", None, gbbox, paths))
625
626        # Convert the objects into specialized classes
627        categories.sort(key=lambda o: (-o[2].y, o[2].x))
628        objects = []
629        for otype, caption_bbox, graphics_bbox, graphics_paths in categories:
630            if "figure" in otype:
631                figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths)
632                objects.append(figure)
633            elif "table" in otype:
634                xlines, ylines, yhlines = [], [], []
635                for path in graphics_paths:
636                    if self._template == "blue_gray" or "_lines" in otype:
637                        if self._colors(path.stroke) == "gray" or "_lines" in otype:
638                            # Intercell paths in gray
639                            if len(path.lines) == 1:
640                                line = path.lines[0]
641                                if line.direction == line.Direction.VERTICAL:
642                                    xlines.append(line.specialize())
643                                elif line.direction == line.Direction.HORIZONTAL:
644                                    ylines.append(line.specialize())
645                                else:
646                                    LOGGER.warn(f"Line not vertical or horizontal: {line}")
647                            else:
648                                LOGGER.warn(f"Path too long: {path}")
649                        elif self._colors(path.fill) == "darkblue":
650                            # Add the bottom line of the dark blue header box as a very thick line
651                            line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
652                            yhlines.append(line)
653
654                    elif self._template == "black_white":
655                        bbox = path.bbox
656                        is_vertical = bbox.width < bbox.height
657                        width = bbox.width if is_vertical else bbox.height
658                        length = bbox.height if is_vertical else bbox.width
659                        if width <= self._spacing["x_em"] / 2:
660                            if length >= self._spacing["y_em"] / 2:
661                                if is_vertical:
662                                    line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width)
663                                    xlines.append(line)
664                                else:
665                                    line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height)
666                                    ylines.append(line)
667                        else:
668                            # Split the rectangle into it's outline
669                            xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1))
670                            xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1))
671                            ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1))
672                            ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1))
673                if yhlines:
674                    yhlines.sort(key=lambda l: l.p0.y)
675                    ylines.append(yhlines[0])
676                if not xlines or not ylines:
677                    continue
678                table = Table(self, graphics_bbox, xlines, ylines, caption_bbox,
679                              is_register="register" in otype)
680                objects.append(table)
681
682        return objects
683
684    @property
685    def content_objects(self) -> list:
686        objs = []
687        for area in self._areas["content"]:
688            objs.extend(self._objects_filtered(area))
689        return objs
690
691    @property
692    def content_graphics(self) -> list:
693        objs = []
694        for area in self._areas["content"]:
695            objs.extend(self._graphics_filtered(area))
696        return objs
697
698    @property
699    def content_lines(self) -> list:
700        return [o for o in self.content_objects if isinstance(o, CharLine)]
701
702    @property
703    def content_tables(self) -> list:
704        return [o for o in self.content_graphics if isinstance(o, Table)]
705
706    @property
707    def content_figures(self) -> list:
708        return [o for o in self.content_graphics if isinstance(o, Figure)]
709
710    def _char_properties(self, line, char):
711        cp = {
712            "superscript": False,
713            "subscript": False,
714            "bold": any(frag in char.font for frag in {"Bold"}),
715            "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
716            "underline": (char.objlink or char.weblink) is not None,
717            "size": round(line.height),
718            "relsize": self._line_size(line),
719            "char": chr(char.unicode),
720        }
721
722        if line.rotation:
723            if char.origin.x < (line.origin - 0.25 * line.height):
724                cp["superscript"] = True
725            elif char.origin.x > (line.origin + 0.15 * line.height):
726                cp["subscript"] = True
727        elif char.origin.y > (line.origin + 0.25 * line.height):
728            cp["superscript"] = True
729        elif char.origin.y < (line.origin - 0.15 * line.height):
730            cp["subscript"] = True
731
732        return cp
733
    def _ast_filtered(self, area: Rectangle, with_graphics=True,
                      ignore_xpos=False, with_bits=True, with_notes=True) -> Node:
        """Build a tree of ``anytree`` nodes describing the content of ``area``.

        The objects returned by ``self._objects_filtered(area, with_graphics)``
        are walked in order. Each one is attached to a tree rooted at an
        ``"area"`` node, using its left position, its distance to the previous
        object and a set of regular expressions to decide where it belongs.

        Node names created here: ``table``, ``figure``, ``head<size>``,
        ``note``, ``caption``, ``listb``/``lists``/``lista``/``listn``, ``bit``,
        ``para`` and ``line``.

        :param area: the rectangle whose content is converted.
        :param with_graphics: forwarded to ``_objects_filtered``; also enables
            the detection of bold "Table N"/"Figure N" caption lines.
        :param ignore_xpos: when true, ``unindent`` never climbs to a parent
            because of a smaller x position.
        :param with_bits: enables the detection of register bit definitions.
        :param with_notes: enables the detection of Note/Caution/Warning lines.
        :return: the root ``Node`` (named ``"area"``) of the tree.
        """
        x_em = self._spacing["x_em"]
        spacing_content = self._spacing["x_content"]
        lh_factor = self._spacing["lh"]
        # spacing_y = self._spacing["y_em"]
        root = Node("area", obj=area, xpos=int(area.left), page=self)

        def unindent(_xpos, _current, _newlines=1):
            """Return the ancestor of ``_current`` that a new node should attach to."""
            current = _current
            # Check if we need to unindent the current node
            while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos:
                current = current.parent
            # A gap of two or more lines closes the current paragraph
            if _newlines >= 2 and current.name == "para":
                current = current.parent
            return current

        def parent_name(current):
            """Return the name of the parent node, or "" for the root."""
            return "" if current.parent is None else current.parent.name

        # `current` is the node new content gets attached to, `ypos` is the
        # vertical position of the previously handled object.
        current = root
        ypos = area.top
        for obj in self._objects_filtered(area, with_graphics):
            xpos = round(obj.bbox.left)
            # Tables should remain in their current hierarchy regardless of indentation
            if isinstance(obj, (Table, Figure)):
                # Attach to the closest enclosing heading, or the root if there is none
                current = next((c for c in current.iter_path_reverse()
                                if c.name.startswith("head")), root)
                name = "figure" if isinstance(obj, Figure) else "table"
                Node(name, parent=current, obj=obj, xpos=xpos, number=-1,
                     _width=obj.bbox.width / area.width, _type=obj._type)
                ypos = obj.bbox.bottom
            # Lines of text need to be carefully checked for indentation
            elif isinstance(obj, CharLine):
                # Distance to the previous object, expressed in (scaled) line heights
                newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
                content = obj.content
                lcontent = content.lstrip()
                # Index of the first character of this line that belongs to the content
                content_start = 0
                linesize = self._line_size(obj)

                # Check when the note has finished (=> paragraphs without italic)
                if (parent_name(current) == "note" and
                    ((current.parent.type == "note" and not obj.contains_font(current.parent._font)) or
                     (current.parent.type in {"caution", "warning"} and newlines >= 2))):
                    current = current.parent.parent

                # Check when the list ends into something indented far too right
                elif (parent_name(current).startswith("list")
                      and (xpos - current.xpos) >= 2 * x_em):
                    current = current.parent.parent

                # print(obj.fonts, ypos, xpos, current.xpos, f"{obj.height:.2f}", content)
                # Check if line is a heading, which may be multi-line, so we must
                # be careful not to nest them, but group them properly
                # Headings are always inserted into the root node!
                if linesize.startswith("h1") or (linesize.startswith("h") and
                        xpos < (spacing_content + 2 * x_em) and "Bold" in obj.chars[0].font):
                    if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) *", content)) is not None:
                        start = min(len(match.group(0)), len(obj.chars) - 1)
                        marker = match.group(1)
                        # "1" -> head2, "1.2" -> head3, "1.2.3" -> head4
                        size = marker.count('.') + 2
                    else:
                        start = 0
                        marker = None
                        # NOTE(review): this is the digit character of the line
                        # size (a str), while the numbered branch yields an int.
                        size = linesize[1]
                    name = f"head{size}"
                    # Check if we're already parsing a heading, do not split into two
                    if parent_name(current) != name or newlines > 2:
                        content_start = start
                        xpos = round(obj.chars[content_start].bbox.left)
                        current = Node(name, parent=root, obj=obj, xpos=xpos,
                                       size=size, marker=marker)
                        current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if the line is a note and deal with the indentation correctly
                elif with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None:
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    # print(obj.fonts)
                    # Correct xposition only if the Note: string is very far left
                    if xpos + 4 * x_em <= current.xpos:
                        xpos = round(obj.chars[content_start].bbox.left)
                    # Prevent nesting of notes, they should only be listed
                    if parent_name(current) == "note":
                        current =  current.parent.parent
                    # Passing 2 as newline count always closes an open paragraph
                    current = unindent(xpos, current, 2)
                    current = Node("note", parent=current, obj=obj, xpos=xpos,
                                   type=match.group(1).lower(), _font=obj.chars[content_start].font)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is Table or Figure caption
                elif with_graphics and ((match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? ?", content)) is not None
                      and "Bold" in obj.chars[0].font):
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    # Captions attach to the closest enclosing heading, or the root
                    current = next((c for c in current.iter_path_reverse()
                                if c.name.startswith("head")), root)
                    current = Node("caption", parent=current, obj=obj, xpos=xpos,
                                   _type=match.group(1).lower(), number=int(match.group(2)))
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is list and group them according to indentation
                elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None:
                    current = unindent(xpos, current, newlines)
                    # The pattern ends with two wildcard characters, which already
                    # belong to the list item text
                    content_start = len(match.group(0)) - 2
                    xpos = round(obj.chars[content_start].bbox.left)
                    # listb: bullet, lists: dash, lista: letter, listn: number
                    name = "listb"
                    value = lcontent[0]
                    if value in {"–", "-"}: name = "lists"
                    elif value.isalpha(): name = "lista"
                    elif value.isnumeric():
                        name = "listn"
                        value = int(match.group(2))
                    current = Node(name, parent=current, obj=obj, xpos=xpos, value=value)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is a register bit definition
                elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None:
                    if obj.contains_font("Bold"):
                        # Use the bold character as delimiter
                        content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font)
                    else:
                        # Default back to the regex
                        if "Reserved" not in content:
                            LOGGER.warning(f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}")
                        # `content_start` temporarily holds the match object here
                        content_start = re.match(r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content)
                        if content_start is None:
                            LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
                            content_start = 0
                        else:
                            content_start = len(content_start.group(0))
                        if not content_start:
                            LOGGER.error(f"Missing content start (=0)! '{content}'!")
                        # Clamp to a valid character index of this line
                        content_start = min(content_start, len(obj.chars) - 1)

                    # Bit definitions attach to the closest enclosing heading, or the root
                    current = next((c for c in current.iter_path_reverse()
                                    if c.name.startswith("head")), root)
                    middle = obj.chars[content_start].bbox.left
                    xpos = round(middle)
                    current = Node("bit", parent=current, obj=obj, xpos=xpos, _page=self,
                                   _middle=middle, _left=area.left, _right=area.right)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if this is a new paragraph
                elif newlines >= 2 or current.name not in {"para"}:
                    # Fix issues where notes are reflowing back left of Note: text
                    if parent_name(current) in {"note"}:
                        if xpos < current.parent.xpos:
                            xpos = current.parent.xpos
                    # Prevent multiline
                    current = unindent(xpos, current, newlines)
                    current = Node("para", parent=current, obj=obj,
                                   xpos=xpos if current.is_root else current.xpos)

                # Continuation line of an existing paragraph: captions, bit
                # definitions and top-level paragraphs keep their node regardless
                # of indentation
                elif (parent_name(current) not in {"caption", "bit", "area"}):
                    current = unindent(xpos, current, newlines)

                # Add the actual line
                # `str` is a preview of the content: the slice ends at index 50
                # of the line, not 50 characters after `content_start`.
                Node("line", parent=current, obj=obj, xpos=xpos,
                     start=content_start, str=content[content_start:50])

                ypos = obj.origin

        return root
896
897    def __repr__(self) -> str:
898        return f"StPage({self.number})"

LOGGER = <Logger modm_data.pdf2html.stmicro.page (WARNING)>

def is_compatible(document) -> bool: View Source

def is_compatible(document) -> bool:
    """Tell whether the document's "Author" metadata mentions "stmicro".

    The comparison is case-insensitive; a missing "Author" entry counts as
    an empty string.
    """
    author = document.metadata.get("Author", "")
    return "stmicro" in author.lower()

def areas_black_white(page) -> dict: View Source

def areas_black_white(page) -> dict:
    """Compute the named page areas of the black/white document template.

    All rectangles are first defined in relative page coordinates and are
    scaled to page units at the end, taking the page rotation into account.

    :param page: the page to analyse; its ``rotation``, ``width``, ``height``,
        ``index``, ``pdf.name``, ``paths`` and ``text_in_area()`` are used.
    :return: dict mapping an area name to a scaled ``Rectangle``, or to a list
        of scaled rectangles for ``"content"``.
    """
    def _scale(r):
        # Rotated pages swap the axes and mirror the horizontal direction
        if page.rotation:
            return Rectangle(r.bottom * page.width, (1 - r.right) * page.height,
                             r.top * page.width, (1 - r.left) * page.height)
        return Rectangle(r.left * page.width, r.bottom * page.height,
                         r.right * page.width, r.top * page.height)

    bottom_left = Rectangle(0.1, 0.1, 0.3, 0.12)
    bottom_middle = Rectangle(0.3, 0.1, 0.7, 0.12)
    bottom_right = Rectangle(0.7, 0.1, 0.9, 0.12)
    top = Rectangle(0.1, 0.9125, 0.9, 0.9375)
    # The first page has a lower content top edge than all other pages
    content = Rectangle(0.025, 0.12, 0.975, 0.905 if page.index else 0.79)
    all_content = [content]
    areas = {
        # Bottom string in the middle: Example "RM0410 Rev 4"
        "id": bottom_middle,
    }
    if page.index == 0:
        # Publish date on the bottom left on first page
        areas["date"] = bottom_left
        # number on the bottom right on first page
        areas["number"] = bottom_right
        # Add top areas
        all_content.insert(0, Rectangle(0.375, 0.855, 0.975, 0.9125))
        all_content.insert(1, Rectangle(0.025, 0.805, 0.975, 0.855))
    else:
        # Page number on bottom
        areas["number"] = bottom_left if page.index % 2 else bottom_right
        # Chapter name on top
        areas["top"] = top

    # Recognize the two column design of the Datasheets with a big table underneath
    if page.index < 3 and "DS" in page.pdf.name:
        # Find a wide path that would denote the beginning of a table
        top_rect = [p.bbox.top / page.height for p in page.paths
                    if _scale(content).contains(p.bbox) and p.bbox.width > page.width * 0.75]
        if top_rect:
            # offset for table label just above it
            # max() gets the list itself: unpacking a single-element list would
            # call max() with one float, which raises a TypeError.
            ybottom = max(top_rect) + 0.0175
        else:
            ybottom = content.bottom
        # Try to find list or sublists in these areas
        mr = Rectangle(0.49, ybottom, 0.51, content.top)
        br = Rectangle(0.51, ybottom, 0.5325, content.top)
        hr = Rectangle(0.5325, ybottom, 0.555, content.top)
        text_middle = page.text_in_area(_scale(mr))
        text_bullets = page.text_in_area(_scale(br))
        text_hyphens = page.text_in_area(_scale(hr))
        # No text across the middle, but list markers right of it => two columns
        if (not text_middle and
            (any(c in text_bullets for c in {"•", chr(61623)}) or
             any(c in text_hyphens for c in {"-"}))):
            areas["middle_bullets"] = br
            areas["middle_hyphens"] = hr
            # Replace the single content area by a left and a right column
            all_content = all_content[:-1]
            all_content.append(Rectangle(content.left, ybottom, 0.5, content.top))
            all_content.append(Rectangle(0.505, ybottom, content.right, content.top))
            if top_rect:
                # The area below the columns, starting at the wide path
                all_content.append(Rectangle(content.left, content.bottom, content.right, ybottom))

    areas["content"] = all_content
    scaled_areas = {}
    for name, area in areas.items():
        if isinstance(area, list):
            scaled_areas[name] = [_scale(r) for r in area]
        else:
            scaled_areas[name] = _scale(area)
    return scaled_areas

def areas_blue_gray(page) -> dict: View Source

 98def areas_blue_gray(page) -> dict:
 99    def _scale(r):
100        return Rectangle(r.left * page.width, r.bottom * page.height,
101                         r.right * page.width, r.top * page.height)
102
103    # This template doesn't use rotated pages, instead uses
104    # hardcoded rotated page dimensions
105    if page.width > page.height:
106        content = Rectangle(0.05, 0.025, 0.89, 0.975)
107        bottom_left = Rectangle(0, 0.6, 0.05, 1)
108        top_right = Rectangle(0.9025, 0.05, 0.9175, 0.7)
109    else:
110        content = Rectangle(0.025, 0.05, 0.975, 0.89 if page.index else 0.81)
111        bottom_left = Rectangle(0, 0, 0.4, 0.05)
112        top_right = Rectangle(0.3, 0.9025, 0.95, 0.9175)
113    areas = {
114        "id": bottom_left,
115        "top": top_right,
116        "all_content": content,
117        "content": []
118    }
119    if page.index == 0:
120        areas["content"] = [
121            # Document device string
122            Rectangle(0.4, 0.91, 0.95, 0.95),
123            # Document description string
124            Rectangle(0.05, 0.81, 0.95, 0.86)
125        ]
126    if page.index < 10:
127        # Contains only a table with product summary
128        br = Rectangle(0.35, content.bottom, 0.37, content.top)
129        text_bullets = page.text_in_area(_scale(br))
130        if any(c in text_bullets for c in {"•", chr(61623)}):
131            areas["middle_bullets"] = br
132            # Contains the actual content here
133            left = Rectangle(content.left, content.bottom, 0.3565, content.top)
134            right = Rectangle(0.3565, content.bottom, content.right, content.top)
135            areas["content"].extend([left, right])
136        else:
137            areas["content"] = [content]
138    else:
139        areas["content"] = [content]
140
141    scaled_areas = {}
142    for name, area in areas.items():
143        if isinstance(area, list):
144            scaled_areas[name] = [_scale(r) for r in area]
145        else:
146            scaled_areas[name] = _scale(area)
147    return scaled_areas

def spacing_black_white(page) -> dict: View Source

def spacing_black_white(page) -> dict:
    """Return the spacing constants of the black/white template for ``page``.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dict with the keys ``x_em``, ``x_left``, ``x_right``,
        ``x_content``, ``y_em``, ``y_tline``, ``lh``, ``sc`` and ``th``.
    """
    rotated = bool(page.rotation)
    # Relative left/right margin of the content
    margin = 0.14 if rotated else 0.1125
    # On rotated pages the em sizes are derived from the swapped dimension
    if rotated:
        x_base, y_base = page.height, page.width
    else:
        x_base, y_base = page.width, page.height
    return {
        # Horizontal spacing: left->right
        "x_em": 0.01 * x_base,
        "x_left": margin * page.width,
        "x_right": (1 - margin) * page.width,
        "x_content": 0.2075 * page.width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * y_base,
        # Max table line thickness
        "y_tline": 0.005 * y_base,
        # Max line height distance to detect paragraphs
        "lh": 1.2 if rotated else 0.9,
        # Max line height distance to detect super-/subscript
        "sc": 0.4 if rotated else 0.325,
        # Table header cell bold text threshold
        "th": 0.33,
    }

def spacing_blue_gray(page) -> dict: View Source

def spacing_blue_gray(page) -> dict:
    """Return the spacing constants of the blue/gray template for ``page``.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dict with the keys ``x_em``, ``x_left``, ``x_right``,
        ``x_content``, ``y_em``, ``y_tline``, ``lh``, ``sc`` and ``th``.
    """
    width, height = page.width, page.height
    if page.rotation:
        # Rotated pages: em sizes use the swapped dimension, margins differ
        return {
            "x_em": 0.01 * height,
            "x_left": 0.05 * width,
            "x_right": (1 - 0.16) * width,
            "x_content": 0.2075 * width,
            "y_em": 0.01 * width,
            "y_tline": 0.005 * width,
            "lh": 1.6,
            "sc": 0.2,
            "th": 0.33,
        }

    margin = 0.07
    return {
        # Horizontal spacing: left->right
        "x_em": 0.01 * width,
        "x_left": margin * width,
        "x_right": (1 - margin) * width,
        "x_content": 0.165 * width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * height,
        # Max table line thickness
        "y_tline": 0.005 * height,
        # Max line height distance to detect paragraphs
        "lh": 0.9,
        # Max line height distance to detect super-/subscript
        "sc": 0.3,
        # Table header cell bold text threshold
        "th": 0.33,
    }

def linesize_black_white(line: float) -> str: View Source

def linesize_black_white(line: float) -> str:
    """Classify a line of the black/white template by its height.

    :param line: object whose ``height`` attribute is compared unrounded.
    :return: ``"h1"``..``"h4"`` for headings, ``"n"`` for normal text and
        ``"fn"`` for anything smaller.
    """
    height = line.height
    # Lower bounds, checked from the largest to the smallest
    thresholds = (
        (17.5, "h1"),
        (15.5, "h2"),
        (13.5, "h3"),
        (11.4, "h4"),
        (8.5, "n"),
    )
    for lower_bound, label in thresholds:
        if height >= lower_bound:
            return label
    return "fn"

def linesize_blue_gray(line: float) -> str: View Source

def linesize_blue_gray(line: float) -> str:
    """Classify a line of the blue/gray template by its rounded height.

    :param line: object whose ``height`` attribute is rounded before comparing.
    :return: ``"h1"``..``"h4"`` for headings, ``"n"`` for normal text and
        ``"fn"`` for anything smaller.
    """
    height = round(line.height)
    # Lower bounds, checked from the largest to the smallest
    thresholds = (
        (16, "h1"),
        (14, "h2"),
        (12, "h3"),
        (10, "h4"),
        (7, "n"),
    )
    for lower_bound, label in thresholds:
        if height >= lower_bound:
            return label
    return "fn"

def colors_black_white(color: int) -> str: View Source

def colors_black_white(color: int) -> str:
    """Map a color value of the black/white template to a color name.

    :param color: the color value; only exact matches are recognized.
    :return: ``"black"``, ``"white"`` or ``"unknown"``.
    """
    known = {
        0xff: "black",
        0xffffffff: "white",
    }
    return known.get(color, "unknown")

def colors_blue_gray(color: int) -> str: View Source

def colors_blue_gray(color: int) -> str:
    """Map a color value of the blue/gray template to a color name.

    :param color: the color value; only exact matches are recognized.
    :return: ``"black"``, ``"white"``, ``"gray"``, ``"lightblue"``,
        ``"darkblue"``, ``"blue"`` or ``"unknown"``.
    """
    known = {
        0xff: "black",
        0xffffffff: "white",
        0xb9c4caff: "gray",
        0x1f81afff: "lightblue",
        0x2052ff: "darkblue",
        0x39a9dcff: "blue",
    }
    return known.get(color, "unknown")

modm_data.pdf2html.stmicro.page

Parameters

Inherited Members