modm_data.pdf2html.stmicro

1# Copyright 2022, Niklas Hauser
2# SPDX-License-Identifier: MPL-2.0
3
4
5from .document import Document
6from .page import Page
7
8__all__ = ["Document", "Page"]
class Document(modm_data.pdf.document.Document):
class Document(PdfDocument):
    """
    PDF document whose `page()` method returns `StmPage` objects.
    """

    def __init__(self, path: str):
        """
        :param path: Path to the PDF to open.
        """
        super().__init__(path)
        # Store the module's `_normalize_document` callable on the instance
        self._normalize = _normalize_document

    def page(self, index: int) -> StmPage:
        """
        :param index: 0-indexed page number.
        :return: the page object for the index.
        """
        assert self.page_count > index
        return StmPage(self, index)

    def __repr__(self) -> str:
        return "STMicroDoc({})".format(self.name)

This class is a convenience wrapper with caching around the high-level APIs of pypdfium.

Document(path: str)
    def __init__(self, path: str):
        """
        :param path: Path to the PDF to open.
        """
        super().__init__(path)
        # Store the module's `_normalize_document` callable on the instance
        self._normalize = _normalize_document
Parameters
  • path: Path to the PDF to open.
def page(self, index: int) -> Page:
45    def page(self, index: int) -> StmPage:
46        assert index < self.page_count
47        return StmPage(self, index)
Parameters
  • index: 0-indexed page number.
Returns

the page object for the index.

Inherited Members
modm_data.pdf.document.Document
name
metadata
destinations
toc
identifier_permanent
identifier_changing
page_count
pages
pypdfium2._helpers.document.PdfDocument
formenv
parent
new
init_forms
get_formtype
get_pagemode
is_tagged
save
get_identifier
get_version
get_metadata_value
METADATA_KEYS
get_metadata_dict
count_attachments
get_attachment
new_attachment
del_attachment
get_page
new_page
del_page
import_pages
get_page_size
get_page_label
page_as_xobject
get_toc
render
pypdfium2.internal.bases.AutoCloseable
close
class Page(modm_data.pdf2html.page.Page):
291class Page(BasePage):
292    def __init__(self, document, index: int):
293        super().__init__(document, index)
294        producer = self.pdf.metadata.get("Producer", "").lower()
295        self._template = "black_white"
296        if "acrobat" in producer or "adobe" in producer:
297            pass
298        elif "antenna" in producer:
299            self._template = "blue_gray"
300        else:
301            _LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")
302
303        if "blue_gray" in self._template:
304            self._areas = _areas_blue_gray(self)
305            self._spacing = _spacing_blue_gray(self)
306            self._colors = _colors_blue_gray
307            self._line_size = _linesize_blue_gray
308        elif "black_white" in self._template:
309            self._areas = _areas_black_white(self)
310            self._spacing = _spacing_black_white(self)
311            self._colors = _colors_black_white
312            self._line_size = _linesize_black_white
313
314    def _unicode_filter(self, code: int) -> int:
315        # Ignore Carriage Return characters and ® (superscript issues)
316        if code in {0xD, ord("®")}:
317            return None
318        # Correct some weird unicode stuffing choices
319        if code in {2}:
320            return ord("-")
321        if code in {61623, 61664}:
322            return ord("•")
323        return code
324
325    @cached_property
326    def identifier(self) -> str:
327        return self.text_in_named_area("id", check_length=False)
328
329    @cached_property
330    def top(self) -> str:
331        if self.index == 0:
332            return "Cover"
333        return self.text_in_named_area("top", check_length=False)
334
335    @cached_property
336    def is_relevant(self) -> bool:
337        if any(c in self.top for c in {"Contents", "List of ", "Index"}):
338            return False
339        return True
340
341    @property
342    def content_ast(self) -> list:
343        ast = []
344        with_graphics = True
345        if "DS" in self.pdf.name:
346            # FIXME: Terrible hack to get the ordering information table fixed
347            # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
348            order_page = next(
349                (
350                    item.page_index
351                    for item in self.pdf.toc
352                    if item.level == 0 and re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)
353                ),
354                -1,
355            )
356            with_graphics = order_page != self.index
357        for area in self._areas["content"]:
358            ast.append(self.ast_in_area(area, with_graphics=with_graphics))
359        # Add a page node to the first leaf to keep track of where a page starts
360        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
361        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
362        return ast
363
364    def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
365        # Find all graphic clusters in this area
366        em = self._spacing["y_em"]
367        large_area = area.offset_x(em / 2)
368        graphic_clusters = self.graphic_clusters(lambda p: large_area.contains(p.bbox), em / 2)
369        # for bbox, paths in raw_graphic_clusters:
370        #     # Some docs have large DRAFT chars in the background
371        #     if any(path.fill == 0xe6e6e6ff and path.stroke == 0xff for path in paths):
372        #         continue
373        #     graphic_clusters.append((bbox, paths))
374
375        # Find the captions and group them by y origin to catch side-by-side figures
376        ycaptions = defaultdict(list)
377        for line in self.charlines_in_area(area, lambda c: "Bold" in c.font):
378            for cluster in line.clusters():
379                for phrase in [r"Figure \d+\.", r"Table \d+\."]:
380                    if re.match(phrase, cluster.content):
381                        ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars))
382        ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)]
383
384        # Now associate these captions with the graphics bboxes
385        categories = []
386        for captions in ycaptions:
387            width = area.width / len(captions)
388            for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)):
389                left, right = area.left + ii * width, area.left + (ii + 1) * width
390                bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height
391
392                # Find the graphic associated with this caption
393                graphic = next(
394                    ((b, p) for b, p in graphic_clusters if b.bottom <= bottom and left <= b.left and b.right <= right),
395                    None,
396                )
397                if graphic is None:
398                    _LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
399                    continue
400
401                if self._template == "blue_gray":
402                    # Search for all lines of the current caption with the same properties
403                    cbbox = Rectangle(left, bottom, right, top)
404                    cchars = self.chars_in_area(cbbox)
405                    while True:
406                        nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top)
407                        nchars = self.chars_in_area(nbbox)
408                        if len(cchars) >= len(nchars):
409                            break
410                        cbbox = nbbox
411                        cchars = nchars
412                else:
413                    cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)
414
415                otype = phrase.split(" ")[0].lower()
416                if "Figure" in phrase:
417                    # Find all other graphics in the bounding box
418                    gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom)
419                    graphics = []
420                    for b, p in graphic_clusters:
421                        if gbbox.overlaps(b):
422                            graphics.append((b, p))
423                    for g in graphics:
424                        graphic_clusters.remove(g)
425                    gbbox = [cluster[0] for cluster in graphics]
426                    gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox)
427                    paths = [p for cluster in graphics for p in cluster[1]]
428
429                    if self._template == "blue_gray":
430                        # Search for characters below the graphics bbox, max 1 y_em
431                        gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom)
432                        while True:
433                            gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom)
434                            if not self.chars_in_area(gbbox):
435                                break
436                    # Generate the new bounding box which includes the caption
437                    gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom)
438                elif "Table" in phrase:
439                    graphic_clusters.remove(graphic)
440                    gbbox, paths = graphic
441                    if (
442                        self._template == "black_white"
443                        and sum(1 for path in paths if path.count == 2) >= len(paths) / 2
444                    ):
445                        otype += "_lines"
446                categories.append((otype, cbbox, gbbox, paths))
447
448        # Deal with the remaining graphic categories
449        for gbbox, paths in graphic_clusters:
450            if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
451                continue
452            category = ""
453            if any(isinstance(p, Image) for p in paths):
454                category = "figure"
455            elif self._template == "blue_gray":
456                if all(self._colors(path.stroke) == "gray" or self._colors(path.fill) == "darkblue" for path in paths):
457                    category = "table"
458                else:
459                    category = "figure"
460            elif self._template == "black_white":
461                # Some tables are rendered explicitly with filled rectangular
462                # shapes with others are implicitly rendered with stroked lines
463                stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2
464                is_table = stroked_table_lines or all(
465                    [any(p.isclose(pp) for pp in path.bbox.points) for p in path.points].count(True)
466                    >= len(path.points) * 2 / 3
467                    for path in paths
468                )
469                if len(paths) > 1 and is_table:
470                    category = "table"
471                    if stroked_table_lines:
472                        category += "_lines"
473                else:
474                    category = "figure"
475
476            if "table" in category:
477                # Check if there are only numbers on top of the table
478                cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"])
479                nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xA, 0xD}]
480
481                if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3:
482                    # This is a register table with invisible top borders!
483                    cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars))
484                    gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top)
485                    name = "register_" + category
486                else:
487                    cbbox = None
488                    name = category
489                categories.append((name, cbbox, gbbox, paths))
490            else:
491                categories.append(("figure", None, gbbox, paths))
492
493        # Convert the objects into specialized classes
494        categories.sort(key=lambda o: (-o[2].y, o[2].x))
495        objects = []
496        for otype, caption_bbox, graphics_bbox, graphics_paths in categories:
497            if "figure" in otype:
498                figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths)
499                objects.append(figure)
500            elif "table" in otype:
501                xlines, ylines, yhlines = [], [], []
502                for path in graphics_paths:
503                    if self._template == "blue_gray" or "_lines" in otype:
504                        if self._colors(path.stroke) == "gray" or "_lines" in otype:
505                            # Intercell paths in gray
506                            if len(path.lines) == 1:
507                                line = path.lines[0]
508                                if line.direction == line.Direction.VERTICAL:
509                                    xlines.append(line.specialize())
510                                elif line.direction == line.Direction.HORIZONTAL:
511                                    ylines.append(line.specialize())
512                                else:
513                                    _LOGGER.warn(f"Line not vertical or horizontal: {line}")
514                            else:
515                                _LOGGER.warn(f"Path too long: {path}")
516                        elif self._colors(path.fill) == "darkblue":
517                            # Add the bottom line of the dark blue header box as a very thick line
518                            line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
519                            yhlines.append(line)
520
521                    elif self._template == "black_white":
522                        bbox = path.bbox
523                        is_vertical = bbox.width < bbox.height
524                        width = bbox.width if is_vertical else bbox.height
525                        length = bbox.height if is_vertical else bbox.width
526                        if width <= self._spacing["x_em"] / 2:
527                            if length >= self._spacing["y_em"] / 2:
528                                if is_vertical:
529                                    line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width)
530                                    xlines.append(line)
531                                else:
532                                    line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height)
533                                    ylines.append(line)
534                        else:
535                            # Split the rectangle into it's outline
536                            xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1))
537                            xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1))
538                            ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1))
539                            ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1))
540                if yhlines:
541                    yhlines.sort(key=lambda line: line.p0.y)
542                    ylines.append(yhlines[0])
543                if not xlines or not ylines:
544                    continue
545                table = Table(self, graphics_bbox, xlines, ylines, caption_bbox, is_register="register" in otype)
546                objects.append(table)
547
548        return objects
549
    def ast_in_area(
        self,
        area: Rectangle,
        with_graphics: bool = True,
        ignore_xpos: bool = False,
        with_bits: bool = True,
        with_notes: bool = True,
    ) -> Node:
        """
        Build a tree of nodes from the objects found in an area of the page.

        Tables and figures become `table`/`figure` nodes. Lines of text are
        grouped into headings, notes, captions, lists, register bit definitions
        and paragraphs based on their content, font, indentation and the
        vertical distance to the previous object.

        :param area: the area whose objects are converted.
        :param with_graphics: passed on to `objects_in_area()`; also enables
                              the detection of table and figure captions.
        :param ignore_xpos: if true, indentation never moves a line back up to a parent node.
        :param with_bits: enables the detection of register bit definitions.
        :param with_notes: enables the detection of notes, cautions and warnings.
        :return: the root node named `area`.
        """
        x_em = self._spacing["x_em"]
        spacing_content = self._spacing["x_content"]
        lh_factor = self._spacing["lh"]
        # spacing_y = self._spacing["y_em"]
        root = Node("area", obj=area, xpos=int(area.left), page=self)

        def unindent(_xpos, _current, _newlines=1):
            # Walk up the tree until the node is no longer indented further than `_xpos`
            current = _current
            # Check if we need to unindent the current node
            while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos:
                current = current.parent
            # Two or more newlines also terminate the current paragraph
            if _newlines >= 2 and current.name == "para":
                current = current.parent
            return current

        def parent_name(current):
            # Name of the parent node, or an empty string for a node without parent
            return "" if current.parent is None else current.parent.name

        current = root
        ypos = area.top
        for obj in self.objects_in_area(area, with_graphics):
            xpos = round(obj.bbox.left)

            # Tables should remain in their current hierarchy regardless of indentation
            if isinstance(obj, (Table, Figure)):
                # Attach to the closest heading on the path to the root, or to the root itself
                current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root)
                name = "figure" if isinstance(obj, Figure) else "table"
                Node(
                    name,
                    parent=current,
                    obj=obj,
                    xpos=xpos,
                    number=-1,
                    _width=obj.bbox.width / area.width,
                    _type=obj._type,
                )
                ypos = obj.bbox.bottom

            # Lines of text need to be carefully checked for indentation
            elif isinstance(obj, CharLine):
                # Vertical distance to the previous object, expressed in lines
                newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
                content = obj.content
                lcontent = content.lstrip()
                content_start = 0
                linesize = self._line_size(obj)

                # Check when the note has finished (=> paragraphs without italic)
                if parent_name(current) == "note" and (
                    (current.parent.type == "note" and not obj.contains_font(current.parent._font))
                    or (current.parent.type in {"caution", "warning"} and newlines >= 2)
                ):
                    current = current.parent.parent

                # Check when the list ends into something indented far too right
                elif parent_name(current).startswith("list") and (xpos - current.xpos) >= 2 * x_em:
                    current = current.parent.parent

                # print(obj.fonts, ypos, xpos, current.xpos, f"{obj.height:.2f}", content)

                # Check if line is a heading, which may be multi-line, so we must
                # be careful not to nest them, but group them properly
                # Headings are always inserted into the root node!
                if linesize.startswith("h1") or (
                    linesize.startswith("h") and xpos < (spacing_content + 2 * x_em) and "Bold" in obj.chars[0].font
                ):
                    if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) *", content)) is not None:
                        start = min(len(match.group(0)), len(obj.chars) - 1)
                        marker = match.group(1)
                        # The heading level follows from the number of dots in the marker
                        size = marker.count(".") + 2
                    else:
                        start = 0
                        marker = None
                        # Without a marker the level is the second character of the line size
                        size = linesize[1]
                    name = f"head{size}"
                    # Check if we're already parsing a heading, do not split into two
                    if parent_name(current) != name or newlines > 2:
                        content_start = start
                        xpos = round(obj.chars[content_start].bbox.left)
                        current = Node(name, parent=root, obj=obj, xpos=xpos, size=size, marker=marker)
                        current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if the line is a note and deal with the indentation correctly
                elif (
                    with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None
                ):
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    # print(obj.fonts)
                    # Correct xposition only if the Note: string is very far left
                    if xpos + 4 * x_em <= current.xpos:
                        xpos = round(obj.chars[content_start].bbox.left)
                    # Prevent nesting of notes, they should only be listed
                    if parent_name(current) == "note":
                        current = current.parent.parent
                    current = unindent(xpos, current, 2)
                    current = Node(
                        "note",
                        parent=current,
                        obj=obj,
                        xpos=xpos,
                        type=match.group(1).lower(),
                        _font=obj.chars[content_start].font,
                    )
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is Table or Figure caption
                elif with_graphics and (
                    (match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? ?", content)) is not None
                    and "Bold" in obj.chars[0].font
                ):
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root)
                    current = Node(
                        "caption",
                        parent=current,
                        obj=obj,
                        xpos=xpos,
                        _type=match.group(1).lower(),
                        number=int(match.group(2)),
                    )
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is list and group them according to indentation
                elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None:
                    current = unindent(xpos, current, newlines)
                    # The pattern consumes two characters of the item text, which are not part of the marker
                    content_start = len(match.group(0)) - 2
                    xpos = round(obj.chars[content_start].bbox.left)
                    name = "listb"
                    value = lcontent[0]
                    if value in {"–", "-"}:
                        name = "lists"
                    elif value.isalpha():
                        name = "lista"
                    elif value.isnumeric():
                        name = "listn"
                        value = int(match.group(2))
                    current = Node(name, parent=current, obj=obj, xpos=xpos, value=value)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is a register bit definition
                elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None:
                    if obj.contains_font("Bold"):
                        # Use the bold character as delimiter
                        content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font)
                    else:
                        # Default back to the regex
                        if "Reserved" not in content:
                            _LOGGER.warning(
                                f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}"
                            )
                        # NOTE: content_start temporarily holds the match object here
                        content_start = re.match(
                            r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content
                        )
                        if content_start is None:
                            _LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
                            content_start = 0
                        else:
                            content_start = len(content_start.group(0))
                        if not content_start:
                            _LOGGER.error(f"Missing content start (=0)! '{content}'!")
                        content_start = min(content_start, len(obj.chars) - 1)

                    current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root)
                    middle = obj.chars[content_start].bbox.left
                    xpos = round(middle)
                    current = Node(
                        "bit",
                        parent=current,
                        obj=obj,
                        xpos=xpos,
                        _page=self,
                        _middle=middle,
                        _left=area.left,
                        _right=area.right,
                    )
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if this is a new paragraph
                elif newlines >= 2 or current.name not in {"para"}:
                    # Fix issues where notes are reflowing back left of Note: text
                    if parent_name(current) in {"note"}:
                        if xpos < current.parent.xpos:
                            xpos = current.parent.xpos
                    # Prevent multiline
                    current = unindent(xpos, current, newlines)
                    current = Node("para", parent=current, obj=obj, xpos=xpos if current.is_root else current.xpos)

                # Continuation lines of captions, bits and top-level paragraphs are never unindented
                elif parent_name(current) not in {"caption", "bit", "area"}:
                    current = unindent(xpos, current, newlines)

                # Add the actual line
                Node("line", parent=current, obj=obj, xpos=xpos, start=content_start, str=content[content_start:50])

                ypos = obj.origin

        return root
753
754    def __repr__(self) -> str:
755        return f"StmPage({self.number})"

This class provides low-level access to graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.

Page(document, index: int)
292    def __init__(self, document, index: int):
293        super().__init__(document, index)
294        producer = self.pdf.metadata.get("Producer", "").lower()
295        self._template = "black_white"
296        if "acrobat" in producer or "adobe" in producer:
297            pass
298        elif "antenna" in producer:
299            self._template = "blue_gray"
300        else:
301            _LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")
302
303        if "blue_gray" in self._template:
304            self._areas = _areas_blue_gray(self)
305            self._spacing = _spacing_blue_gray(self)
306            self._colors = _colors_blue_gray
307            self._line_size = _linesize_blue_gray
308        elif "black_white" in self._template:
309            self._areas = _areas_black_white(self)
310            self._spacing = _spacing_black_white(self)
311            self._colors = _colors_black_white
312            self._line_size = _linesize_black_white
Parameters
  • document: a PDF document.
  • index: 0-indexed page number.
identifier: str
325    @cached_property
326    def identifier(self) -> str:
327        return self.text_in_named_area("id", check_length=False)
top: str
329    @cached_property
330    def top(self) -> str:
331        if self.index == 0:
332            return "Cover"
333        return self.text_in_named_area("top", check_length=False)
is_relevant: bool
335    @cached_property
336    def is_relevant(self) -> bool:
337        if any(c in self.top for c in {"Contents", "List of ", "Index"}):
338            return False
339        return True

Is this page relevant for the conversion?

content_ast: list
341    @property
342    def content_ast(self) -> list:
343        ast = []
344        with_graphics = True
345        if "DS" in self.pdf.name:
346            # FIXME: Terrible hack to get the ordering information table fixed
347            # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
348            order_page = next(
349                (
350                    item.page_index
351                    for item in self.pdf.toc
352                    if item.level == 0 and re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)
353                ),
354                -1,
355            )
356            with_graphics = order_page != self.index
357        for area in self._areas["content"]:
358            ast.append(self.ast_in_area(area, with_graphics=with_graphics))
359        # Add a page node to the first leaf to keep track of where a page starts
360        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
361        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
362        return ast

The abstract syntax trees in the content area.

def graphics_in_area( self, area: modm_data.utils.Rectangle) -> list[modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure]:
364    def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
365        # Find all graphic clusters in this area
366        em = self._spacing["y_em"]
367        large_area = area.offset_x(em / 2)
368        graphic_clusters = self.graphic_clusters(lambda p: large_area.contains(p.bbox), em / 2)
369        # for bbox, paths in raw_graphic_clusters:
370        #     # Some docs have large DRAFT chars in the background
371        #     if any(path.fill == 0xe6e6e6ff and path.stroke == 0xff for path in paths):
372        #         continue
373        #     graphic_clusters.append((bbox, paths))
374
375        # Find the captions and group them by y origin to catch side-by-side figures
376        ycaptions = defaultdict(list)
377        for line in self.charlines_in_area(area, lambda c: "Bold" in c.font):
378            for cluster in line.clusters():
379                for phrase in [r"Figure \d+\.", r"Table \d+\."]:
380                    if re.match(phrase, cluster.content):
381                        ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars))
382        ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)]
383
384        # Now associate these captions with the graphics bboxes
385        categories = []
386        for captions in ycaptions:
387            width = area.width / len(captions)
388            for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)):
389                left, right = area.left + ii * width, area.left + (ii + 1) * width
390                bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height
391
392                # Find the graphic associated with this caption
393                graphic = next(
394                    ((b, p) for b, p in graphic_clusters if b.bottom <= bottom and left <= b.left and b.right <= right),
395                    None,
396                )
397                if graphic is None:
398                    _LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
399                    continue
400
401                if self._template == "blue_gray":
402                    # Search for all lines of the current caption with the same properties
403                    cbbox = Rectangle(left, bottom, right, top)
404                    cchars = self.chars_in_area(cbbox)
405                    while True:
406                        nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top)
407                        nchars = self.chars_in_area(nbbox)
408                        if len(cchars) >= len(nchars):
409                            break
410                        cbbox = nbbox
411                        cchars = nchars
412                else:
413                    cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)
414
415                otype = phrase.split(" ")[0].lower()
416                if "Figure" in phrase:
417                    # Find all other graphics in the bounding box
418                    gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom)
419                    graphics = []
420                    for b, p in graphic_clusters:
421                        if gbbox.overlaps(b):
422                            graphics.append((b, p))
423                    for g in graphics:
424                        graphic_clusters.remove(g)
425                    gbbox = [cluster[0] for cluster in graphics]
426                    gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox)
427                    paths = [p for cluster in graphics for p in cluster[1]]
428
429                    if self._template == "blue_gray":
430                        # Search for characters below the graphics bbox, max 1 y_em
431                        gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom)
432                        while True:
433                            gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom)
434                            if not self.chars_in_area(gbbox):
435                                break
436                    # Generate the new bounding box which includes the caption
437                    gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom)
438                elif "Table" in phrase:
439                    graphic_clusters.remove(graphic)
440                    gbbox, paths = graphic
441                    if (
442                        self._template == "black_white"
443                        and sum(1 for path in paths if path.count == 2) >= len(paths) / 2
444                    ):
445                        otype += "_lines"
446                categories.append((otype, cbbox, gbbox, paths))
447
448        # Deal with the remaining graphic categories
449        for gbbox, paths in graphic_clusters:
450            if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
451                continue
452            category = ""
453            if any(isinstance(p, Image) for p in paths):
454                category = "figure"
455            elif self._template == "blue_gray":
456                if all(self._colors(path.stroke) == "gray" or self._colors(path.fill) == "darkblue" for path in paths):
457                    category = "table"
458                else:
459                    category = "figure"
460            elif self._template == "black_white":
461                # Some tables are rendered explicitly with filled rectangular
462                # shapes with others are implicitly rendered with stroked lines
463                stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2
464                is_table = stroked_table_lines or all(
465                    [any(p.isclose(pp) for pp in path.bbox.points) for p in path.points].count(True)
466                    >= len(path.points) * 2 / 3
467                    for path in paths
468                )
469                if len(paths) > 1 and is_table:
470                    category = "table"
471                    if stroked_table_lines:
472                        category += "_lines"
473                else:
474                    category = "figure"
475
476            if "table" in category:
477                # Check if there are only numbers on top of the table
478                cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"])
479                nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xA, 0xD}]
480
481                if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3:
482                    # This is a register table with invisible top borders!
483                    cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars))
484                    gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top)
485                    name = "register_" + category
486                else:
487                    cbbox = None
488                    name = category
489                categories.append((name, cbbox, gbbox, paths))
490            else:
491                categories.append(("figure", None, gbbox, paths))
492
493        # Convert the objects into specialized classes
494        categories.sort(key=lambda o: (-o[2].y, o[2].x))
495        objects = []
496        for otype, caption_bbox, graphics_bbox, graphics_paths in categories:
497            if "figure" in otype:
498                figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths)
499                objects.append(figure)
500            elif "table" in otype:
501                xlines, ylines, yhlines = [], [], []
502                for path in graphics_paths:
503                    if self._template == "blue_gray" or "_lines" in otype:
504                        if self._colors(path.stroke) == "gray" or "_lines" in otype:
505                            # Intercell paths in gray
506                            if len(path.lines) == 1:
507                                line = path.lines[0]
508                                if line.direction == line.Direction.VERTICAL:
509                                    xlines.append(line.specialize())
510                                elif line.direction == line.Direction.HORIZONTAL:
511                                    ylines.append(line.specialize())
512                                else:
513                                    _LOGGER.warn(f"Line not vertical or horizontal: {line}")
514                            else:
515                                _LOGGER.warn(f"Path too long: {path}")
516                        elif self._colors(path.fill) == "darkblue":
517                            # Add the bottom line of the dark blue header box as a very thick line
518                            line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
519                            yhlines.append(line)
520
521                    elif self._template == "black_white":
522                        bbox = path.bbox
523                        is_vertical = bbox.width < bbox.height
524                        width = bbox.width if is_vertical else bbox.height
525                        length = bbox.height if is_vertical else bbox.width
526                        if width <= self._spacing["x_em"] / 2:
527                            if length >= self._spacing["y_em"] / 2:
528                                if is_vertical:
529                                    line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width)
530                                    xlines.append(line)
531                                else:
532                                    line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height)
533                                    ylines.append(line)
534                        else:
535                            # Split the rectangle into it's outline
536                            xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1))
537                            xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1))
538                            ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1))
539                            ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1))
540                if yhlines:
541                    yhlines.sort(key=lambda line: line.p0.y)
542                    ylines.append(yhlines[0])
543                if not xlines or not ylines:
544                    continue
545                table = Table(self, graphics_bbox, xlines, ylines, caption_bbox, is_register="register" in otype)
546                objects.append(table)
547
548        return objects

Find all tables and figures in this area.

Parameters
  • area: area to search for graphics.
Returns

list of tables and figures.

def ast_in_area( self, area: modm_data.utils.Rectangle, with_graphics: bool = True, ignore_xpos: bool = False, with_bits: bool = True, with_notes: bool = True) -> anytree.node.node.Node:
550    def ast_in_area(
551        self,
552        area: Rectangle,
553        with_graphics: bool = True,
554        ignore_xpos: bool = False,
555        with_bits: bool = True,
556        with_notes: bool = True,
557    ) -> Node:
           # Convert the content of `area` into a tree of anytree nodes.
           #
           # Parameters:
           #   area: area to search for content.
           #   with_graphics: passed on to objects_in_area() and required for
           #       "Table N" / "Figure N" caption lines to become caption nodes.
           #   ignore_xpos: if true, nodes are never unindented because of the
           #       horizontal position of a line.
           #   with_bits: if true, lines looking like register bit definitions
           #       become "bit" nodes.
           #   with_notes: if true, lines starting with Note/Caution/Warning
           #       become "note" nodes.
           # Returns:
           #   the root node, named "area", with the content nodes below it.
           #
           # The objects of the area are processed in order while `current` tracks
           # the node new lines are attached to and `ypos` the vertical position of
           # the previously handled object.
558        x_em = self._spacing["x_em"]
559        spacing_content = self._spacing["x_content"]
560        lh_factor = self._spacing["lh"]
561        # spacing_y = self._spacing["y_em"]
562        root = Node("area", obj=area, xpos=int(area.left), page=self)
563
564        def unindent(_xpos, _current, _newlines=1):
               # Returns the ancestor of `_current` that a line at `_xpos` belongs to.
565            current = _current
566            # Check if we need to unindent the current node
               # Climb while the line starts more than one x_em left of the node.
567            while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos:
568                current = current.parent
               # Two or more line breaks end the current paragraph.
569            if _newlines >= 2 and current.name == "para":
570                current = current.parent
571            return current
572
573        def parent_name(current):
               # Name of the parent node, or "" for a node without parent.
574            return "" if current.parent is None else current.parent.name
575
576        current = root
577        ypos = area.top
578        for obj in self.objects_in_area(area, with_graphics):
579            xpos = round(obj.bbox.left)
580
581            # Tables should remain in their current hierarchy regardless of indentation
               # They are attached to the closest enclosing heading node, or to the
               # root if there is none, and `current` is moved there as well.
582            if isinstance(obj, (Table, Figure)):
583                current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root)
584                name = "figure" if isinstance(obj, Figure) else "table"
585                Node(
586                    name,
587                    parent=current,
588                    obj=obj,
589                    xpos=xpos,
590                    number=-1,
591                    _width=obj.bbox.width / area.width,
592                    _type=obj._type,
593                )
594                ypos = obj.bbox.bottom
595
596            # Lines of text need to be carefully checked for indentation
597            elif isinstance(obj, CharLine):
                   # Distance to the previous object in units of line height
                   # (line height = lh_factor * character height), rounded.
598                newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
599                content = obj.content
600                lcontent = content.lstrip()
                   # Index of the first character of the actual text, i.e. after
                   # any marker such as a heading number or a bullet.
601                content_start = 0
602                linesize = self._line_size(obj)
603
604                # Check when the note has finished (=> paragraphs without italic)
                   # A "note" ends when the line no longer contains the font of the
                   # note's first text character; "caution" and "warning" end after
                   # two or more line breaks.
605                if parent_name(current) == "note" and (
606                    (current.parent.type == "note" and not obj.contains_font(current.parent._font))
607                    or (current.parent.type in {"caution", "warning"} and newlines >= 2)
608                ):
609                    current = current.parent.parent
610
611                # Check when the list ends into something indented far too right
612                elif parent_name(current).startswith("list") and (xpos - current.xpos) >= 2 * x_em:
613                    current = current.parent.parent
614
615                # print(obj.fonts, ypos, xpos, current.xpos, f"{obj.height:.2f}", content)
616
617                # Check if line is a heading, which may be multi-line, so we must
618                # be careful not to nest them, but group them properly
619                # Headings are always inserted into the root node!
620                if linesize.startswith("h1") or (
621                    linesize.startswith("h") and xpos < (spacing_content + 2 * x_em) and "Bold" in obj.chars[0].font
622                ):
                       # Numbered headings ("1", "1.2", "1.2.3") derive their level
                       # from the number of dots; others use the line size class.
623                    if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) *", content)) is not None:
624                        start = min(len(match.group(0)), len(obj.chars) - 1)
625                        marker = match.group(1)
626                        size = marker.count(".") + 2
627                    else:
628                        start = 0
629                        marker = None
                           # NOTE(review): this is a one-character string while the
                           # branch above yields an int -- confirm consumers of
                           # `size` handle both.
630                        size = linesize[1]
631                    name = f"head{size}"
632                    # Check if we're already parsing a heading, do not split into two
633                    if parent_name(current) != name or newlines > 2:
634                        content_start = start
635                        xpos = round(obj.chars[content_start].bbox.left)
636                        current = Node(name, parent=root, obj=obj, xpos=xpos, size=size, marker=marker)
637                        current = Node("para", parent=current, obj=obj, xpos=current.xpos)
638
639                # Check if the line is a note and deal with the indentation correctly
640                elif (
641                    with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None
642                ):
643                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
644                    # print(obj.fonts)
645                    # Correct xposition only if the Note: string is very far left
646                    if xpos + 4 * x_em <= current.xpos:
647                        xpos = round(obj.chars[content_start].bbox.left)
648                    # Prevent nesting of notes, they should only be listed
649                    if parent_name(current) == "note":
650                        current = current.parent.parent
                       # Passing 2 as newline count always closes an open paragraph.
651                    current = unindent(xpos, current, 2)
                       # `_font` stores the font of the first text character; it is
                       # used above to detect where the note ends.
652                    current = Node(
653                        "note",
654                        parent=current,
655                        obj=obj,
656                        xpos=xpos,
657                        type=match.group(1).lower(),
658                        _font=obj.chars[content_start].font,
659                    )
660                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)
661
662                # Check if line is Table or Figure caption
                   # Only bold lines count; the caption is attached to the closest
                   # enclosing heading node or to the root.
663                elif with_graphics and (
664                    (match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? ?", content)) is not None
665                    and "Bold" in obj.chars[0].font
666                ):
667                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
668                    current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root)
669                    current = Node(
670                        "caption",
671                        parent=current,
672                        obj=obj,
673                        xpos=xpos,
674                        _type=match.group(1).lower(),
675                        number=int(match.group(2)),
676                    )
677                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)
678
679                # Check if line is list and group them according to indentation
                   # Matches bullet/dash items, numbered items ("1. ") and lettered
                   # items ("a) "). Each alternative ends in two wildcard characters
                   # of text, hence the "- 2" to get the index where the text starts.
680                elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None:
681                    current = unindent(xpos, current, newlines)
682                    content_start = len(match.group(0)) - 2
683                    xpos = round(obj.chars[content_start].bbox.left)
                       # Node name by marker kind: listb (bullet), lists (dash),
                       # lista (letter), listn (number, with the value as int).
684                    name = "listb"
685                    value = lcontent[0]
686                    if value in {"–", "-"}:
687                        name = "lists"
688                    elif value.isalpha():
689                        name = "lista"
690                    elif value.isnumeric():
691                        name = "listn"
692                        value = int(match.group(2))
693                    current = Node(name, parent=current, obj=obj, xpos=xpos, value=value)
694                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)
695
696                # Check if line is a register bit definition
697                elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None:
698                    if obj.contains_font("Bold"):
699                        # Use the bold character as delimiter
700                        content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font)
701                    else:
702                        # Default back to the regex
703                        if "Reserved" not in content:
704                            _LOGGER.warning(
705                                f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}"
706                            )
                           # `content_start` temporarily holds the match object and is
                           # turned into the length of the matched prefix below.
707                        content_start = re.match(
708                            r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content
709                        )
710                        if content_start is None:
711                            _LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
712                            content_start = 0
713                        else:
714                            content_start = len(content_start.group(0))
715                        if not content_start:
716                            _LOGGER.error(f"Missing content start (=0)! '{content}'!")
                           # Keep the index inside the character list of the line.
717                        content_start = min(content_start, len(obj.chars) - 1)
718
                       # Bit definitions hang off the closest heading node or the root.
719                    current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root)
720                    middle = obj.chars[content_start].bbox.left
721                    xpos = round(middle)
722                    current = Node(
723                        "bit",
724                        parent=current,
725                        obj=obj,
726                        xpos=xpos,
727                        _page=self,
728                        _middle=middle,
729                        _left=area.left,
730                        _right=area.right,
731                    )
732                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)
733
734                # Check if this is a new paragraph
735                elif newlines >= 2 or current.name not in {"para"}:
736                    # Fix issues where notes are reflowing back left of Note: text
737                    if parent_name(current) in {"note"}:
738                        if xpos < current.parent.xpos:
739                            xpos = current.parent.xpos
740                    # Prevent multiline
741                    current = unindent(xpos, current, newlines)
                       # Directly below the root the paragraph keeps the line's own
                       # x position, otherwise it takes over the parent's.
742                    current = Node("para", parent=current, obj=obj, xpos=xpos if current.is_root else current.xpos)
743
                   # Continuation line of a paragraph: only unindent, and never
                   # inside captions, bit definitions or directly below the root.
744                elif parent_name(current) not in {"caption", "bit", "area"}:
745                    current = unindent(xpos, current, newlines)
746
747                # Add the actual line
                   # NOTE(review): the slice end 50 is an absolute index, not a
                   # length, so `str` is empty when content_start >= 50 -- confirm
                   # this attribute is only a debugging preview.
748                Node("line", parent=current, obj=obj, xpos=xpos, start=content_start, str=content[content_start:50])
749
750                ypos = obj.origin
751
752        return root

Convert the area content into an abstract syntax tree.

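Parameters
  • area: area to search for content.
  • with_graphics: whether to include the tables and figures found in the area.
  • ignore_xpos: if true, never unindent because of the horizontal position of a line.
  • with_bits: whether to detect register bit definitions.
  • with_notes: whether to detect note, caution and warning blocks.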
Returns

An abstract syntax tree including the content formatting.

Inherited Members
modm_data.pdf2html.page.Page
text_in_named_area
charlines_in_area
graphic_bboxes_in_area
objects_in_area
content_objects
content_graphics
content_lines
content_tables
content_figures
modm_data.pdf.page.Page
index
number
label
width
height
rotation
bbox
char_count
char
chars
chars_in_area
text_in_area
structures
find
paths
images
graphic_clusters
pypdfium2._helpers.page.PdfPage
parent
get_width
get_height
get_size
get_rotation
set_rotation
get_mediabox
set_mediabox
get_cropbox
set_cropbox
get_bleedbox
set_bleedbox
get_trimbox
set_trimbox
get_artbox
set_artbox
get_bbox
get_textpage
insert_obj
remove_obj
gen_content
get_objects
render
pypdfium2.internal.bases.AutoCloseable
close