modm_data.pdf2html.stmicro
class
Document(modm_data.pdf.document.Document):
class Document(PdfDocument):
    """PDF document whose pages are handed out as `StmPage` objects."""

    def __init__(self, path: str):
        """
        :param path: Path to the PDF to open.
        """
        super().__init__(path)
        # Install this module's normalizer function on the instance.
        self._normalize = _normalize_document

    def page(self, index: int) -> StmPage:
        """
        :param index: 0-indexed page number.
        :return: the page object for the index.
        """
        page_total = self.page_count
        # Only the upper bound is checked here.
        assert index < page_total
        return StmPage(self, index)

    def __repr__(self) -> str:
        doc_name = self.name
        return f"STMicroDoc({doc_name})"
This class is a convenience wrapper with caching around the high-level APIs of pypdfium.
Document(path: str)
def __init__(self, path: str):
    """
    :param path: Path to the PDF to open.
    """
    super().__init__(path)
    # Install this module's normalizer function on the instance.
    self._normalize = _normalize_document
Parameters
- path: Path to the PDF to open.
def page(self, index: int) -> StmPage:
    """
    :param index: 0-indexed page number.
    :return: the page object for the index.
    """
    page_total = self.page_count
    # Only the upper bound is checked here.
    assert index < page_total
    return StmPage(self, index)
Parameters
- index: 0-indexed page number.
Returns
the page object for the index.
Inherited Members
- modm_data.pdf.document.Document
- name
- metadata
- destinations
- toc
- identifier_permanent
- identifier_changing
- page_count
- pages
- pypdfium2._helpers.document.PdfDocument
- formenv
- parent
- new
- init_forms
- get_formtype
- get_pagemode
- is_tagged
- save
- get_identifier
- get_version
- get_metadata_value
- METADATA_KEYS
- get_metadata_dict
- count_attachments
- get_attachment
- new_attachment
- del_attachment
- get_page
- new_page
- del_page
- import_pages
- get_page_size
- get_page_label
- page_as_xobject
- get_toc
- render
- pypdfium2.internal.bases.AutoCloseable
- close
class Page(BasePage):
    """
    A single page of an STMicro PDF document.

    The layout template ("black_white" or "blue_gray") is chosen from the
    document's "Producer" metadata. It selects the area, spacing, color and
    line-size helpers that the graphics and AST extraction below rely on.
    """

    def __init__(self, document, index: int):
        """
        :param document: a PDF document.
        :param index: 0-indexed page number.
        """
        super().__init__(document, index)
        producer = self.pdf.metadata.get("Producer", "").lower()
        self._template = "black_white"
        if "acrobat" in producer or "adobe" in producer:
            pass
        elif "antenna" in producer:
            self._template = "blue_gray"
        else:
            _LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")

        if "blue_gray" in self._template:
            self._areas = _areas_blue_gray(self)
            self._spacing = _spacing_blue_gray(self)
            self._colors = _colors_blue_gray
            self._line_size = _linesize_blue_gray
        elif "black_white" in self._template:
            self._areas = _areas_black_white(self)
            self._spacing = _spacing_black_white(self)
            self._colors = _colors_black_white
            self._line_size = _linesize_black_white

    def _unicode_filter(self, code: int) -> int | None:
        """
        Map a character code to the code that should be used instead.

        :param code: the character code to filter.
        :return: the replacement code, or `None` if the character is to be ignored.
        """
        # Ignore Carriage Return characters and ® (superscript issues)
        if code in {0xD, ord("®")}:
            return None
        # Correct some weird unicode stuffing choices
        if code in {2}:
            return ord("-")
        if code in {61623, 61664}:
            return ord("•")
        return code

    @cached_property
    def identifier(self) -> str:
        """The text found in the named area "id"."""
        return self.text_in_named_area("id", check_length=False)

    @cached_property
    def top(self) -> str:
        """The text found in the named area "top", or "Cover" for the first page."""
        if self.index == 0:
            return "Cover"
        return self.text_in_named_area("top", check_length=False)

    @cached_property
    def is_relevant(self) -> bool:
        """Is this page relevant for the conversion?"""
        # Pages whose top text marks them as contents, lists or index are skipped.
        return not any(c in self.top for c in {"Contents", "List of ", "Index"})

    @property
    def content_ast(self) -> list:
        """The abstract syntax trees in the content area."""
        ast = []
        with_graphics = True
        if "DS" in self.pdf.name:
            # FIXME: Terrible hack to get the ordering information table fixed
            # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
            order_page = next(
                (
                    item.page_index
                    for item in self.pdf.toc
                    if item.level == 0 and re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)
                ),
                -1,
            )
            with_graphics = order_page != self.index
        for area in self._areas["content"]:
            ast.append(self.ast_in_area(area, with_graphics=with_graphics))
        # Add a page node to the first leaf to keep track of where a page starts
        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
        return ast

    def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
        """
        Find all tables and figures in this area.

        :param area: area to search for graphics.
        :return: list of tables and figures.
        """
        # Find all graphic clusters in this area
        em = self._spacing["y_em"]
        large_area = area.offset_x(em / 2)
        graphic_clusters = self.graphic_clusters(lambda p: large_area.contains(p.bbox), em / 2)
        # for bbox, paths in raw_graphic_clusters:
        #     # Some docs have large DRAFT chars in the background
        #     if any(path.fill == 0xe6e6e6ff and path.stroke == 0xff for path in paths):
        #         continue
        #     graphic_clusters.append((bbox, paths))

        # Find the captions and group them by y origin to catch side-by-side figures
        ycaptions = defaultdict(list)
        for line in self.charlines_in_area(area, lambda c: "Bold" in c.font):
            for cluster in line.clusters():
                for phrase in [r"Figure \d+\.", r"Table \d+\."]:
                    if re.match(phrase, cluster.content):
                        ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars))
        ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)]

        # Now associate these captions with the graphics bboxes
        categories = []
        for captions in ycaptions:
            # Captions sharing a y origin split the area width evenly between them
            width = area.width / len(captions)
            for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)):
                left, right = area.left + ii * width, area.left + (ii + 1) * width
                bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height

                # Find the graphic associated with this caption
                graphic = next(
                    ((b, p) for b, p in graphic_clusters if b.bottom <= bottom and left <= b.left and b.right <= right),
                    None,
                )
                if graphic is None:
                    _LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
                    continue

                if self._template == "blue_gray":
                    # Search for all lines of the current caption with the same properties
                    cbbox = Rectangle(left, bottom, right, top)
                    cchars = self.chars_in_area(cbbox)
                    while True:
                        nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top)
                        nchars = self.chars_in_area(nbbox)
                        # Stop growing once the enlarged box contains no additional characters
                        if len(cchars) >= len(nchars):
                            break
                        cbbox = nbbox
                        cchars = nchars
                else:
                    cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)

                otype = phrase.split(" ")[0].lower()
                if "Figure" in phrase:
                    # Find all other graphics in the bounding box
                    gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom)
                    graphics = []
                    for b, p in graphic_clusters:
                        if gbbox.overlaps(b):
                            graphics.append((b, p))
                    for g in graphics:
                        graphic_clusters.remove(g)
                    gbbox = [cluster[0] for cluster in graphics]
                    gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox)
                    paths = [p for cluster in graphics for p in cluster[1]]

                    if self._template == "blue_gray":
                        # Search for characters below the graphics bbox, max 1 y_em
                        gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom)
                        while True:
                            gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom)
                            if not self.chars_in_area(gbbox):
                                break
                    # Generate the new bounding box which includes the caption
                    gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom)
                elif "Table" in phrase:
                    graphic_clusters.remove(graphic)
                    gbbox, paths = graphic
                    if (
                        self._template == "black_white"
                        and sum(1 for path in paths if path.count == 2) >= len(paths) / 2
                    ):
                        otype += "_lines"
                categories.append((otype, cbbox, gbbox, paths))

        # Deal with the remaining graphic categories
        for gbbox, paths in graphic_clusters:
            if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
                continue
            category = ""
            if any(isinstance(p, Image) for p in paths):
                category = "figure"
            elif self._template == "blue_gray":
                if all(self._colors(path.stroke) == "gray" or self._colors(path.fill) == "darkblue" for path in paths):
                    category = "table"
                else:
                    category = "figure"
            elif self._template == "black_white":
                # Some tables are rendered explicitly with filled rectangular
                # shapes while others are implicitly rendered with stroked lines
                stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2
                is_table = stroked_table_lines or all(
                    [any(p.isclose(pp) for pp in path.bbox.points) for p in path.points].count(True)
                    >= len(path.points) * 2 / 3
                    for path in paths
                )
                if len(paths) > 1 and is_table:
                    category = "table"
                    if stroked_table_lines:
                        category += "_lines"
                else:
                    category = "figure"

            if "table" in category:
                # Check if there are only numbers on top of the table
                cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"])
                nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xA, 0xD}]

                if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3:
                    # This is a register table with invisible top borders!
                    cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars))
                    gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top)
                    name = "register_" + category
                else:
                    cbbox = None
                    name = category
                categories.append((name, cbbox, gbbox, paths))
            else:
                categories.append(("figure", None, gbbox, paths))

        # Convert the objects into specialized classes
        categories.sort(key=lambda o: (-o[2].y, o[2].x))
        objects = []
        for otype, caption_bbox, graphics_bbox, graphics_paths in categories:
            if "figure" in otype:
                figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths)
                objects.append(figure)
            elif "table" in otype:
                xlines, ylines, yhlines = [], [], []
                for path in graphics_paths:
                    if self._template == "blue_gray" or "_lines" in otype:
                        if self._colors(path.stroke) == "gray" or "_lines" in otype:
                            # Intercell paths in gray
                            if len(path.lines) == 1:
                                line = path.lines[0]
                                if line.direction == line.Direction.VERTICAL:
                                    xlines.append(line.specialize())
                                elif line.direction == line.Direction.HORIZONTAL:
                                    ylines.append(line.specialize())
                                else:
                                    _LOGGER.warning(f"Line not vertical or horizontal: {line}")
                            else:
                                _LOGGER.warning(f"Path too long: {path}")
                        elif self._colors(path.fill) == "darkblue":
                            # Add the bottom line of the dark blue header box as a very thick line
                            line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
                            yhlines.append(line)

                    elif self._template == "black_white":
                        bbox = path.bbox
                        is_vertical = bbox.width < bbox.height
                        width = bbox.width if is_vertical else bbox.height
                        length = bbox.height if is_vertical else bbox.width
                        if width <= self._spacing["x_em"] / 2:
                            if length >= self._spacing["y_em"] / 2:
                                if is_vertical:
                                    line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width)
                                    xlines.append(line)
                                else:
                                    line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height)
                                    ylines.append(line)
                        else:
                            # Split the rectangle into its outline
                            xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1))
                            xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1))
                            ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1))
                            ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1))
                if yhlines:
                    # Only the header line with the smallest y coordinate is kept
                    yhlines.sort(key=lambda line: line.p0.y)
                    ylines.append(yhlines[0])
                if not xlines or not ylines:
                    continue
                table = Table(self, graphics_bbox, xlines, ylines, caption_bbox, is_register="register" in otype)
                objects.append(table)

        return objects

    def ast_in_area(
        self,
        area: Rectangle,
        with_graphics: bool = True,
        ignore_xpos: bool = False,
        with_bits: bool = True,
        with_notes: bool = True,
    ) -> Node:
        """
        Build a tree of headings, notes, captions, lists, bit definitions,
        paragraphs and lines from the objects found in the area.

        :param area: area to convert.
        :param with_graphics: forwarded to `objects_in_area`; also enables
                              detection of table and figure captions.
        :param ignore_xpos: do not walk up the tree based on the x position.
        :param with_bits: enable detection of register bit definitions.
        :param with_notes: enable detection of note, caution and warning lines.
        :return: the root "area" node of the tree.
        """
        x_em = self._spacing["x_em"]
        spacing_content = self._spacing["x_content"]
        lh_factor = self._spacing["lh"]
        # spacing_y = self._spacing["y_em"]
        root = Node("area", obj=area, xpos=int(area.left), page=self)

        def unindent(_xpos, _current, _newlines=1):
            current = _current
            # Check if we need to unindent the current node
            while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos:
                current = current.parent
            if _newlines >= 2 and current.name == "para":
                current = current.parent
            return current

        def parent_name(current):
            return "" if current.parent is None else current.parent.name

        current = root
        ypos = area.top
        for obj in self.objects_in_area(area, with_graphics):
            xpos = round(obj.bbox.left)

            # Tables should remain in their current hierarchy regardless of indentation
            if isinstance(obj, (Table, Figure)):
                current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root)
                name = "figure" if isinstance(obj, Figure) else "table"
                Node(
                    name,
                    parent=current,
                    obj=obj,
                    xpos=xpos,
                    number=-1,
                    _width=obj.bbox.width / area.width,
                    _type=obj._type,
                )
                ypos = obj.bbox.bottom

            # Lines of text need to be carefully checked for indentation
            elif isinstance(obj, CharLine):
                newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
                content = obj.content
                lcontent = content.lstrip()
                content_start = 0
                linesize = self._line_size(obj)

                # Check when the note has finished (=> paragraphs without italic)
                if parent_name(current) == "note" and (
                    (current.parent.type == "note" and not obj.contains_font(current.parent._font))
                    or (current.parent.type in {"caution", "warning"} and newlines >= 2)
                ):
                    current = current.parent.parent

                # Check when the list ends into something indented far too right
                elif parent_name(current).startswith("list") and (xpos - current.xpos) >= 2 * x_em:
                    current = current.parent.parent

                # print(obj.fonts, ypos, xpos, current.xpos, f"{obj.height:.2f}", content)

                # Check if line is a heading, which may be multi-line, so we must
                # be careful not to nest them, but group them properly
                # Headings are always inserted into the root node!
                if linesize.startswith("h1") or (
                    linesize.startswith("h") and xpos < (spacing_content + 2 * x_em) and "Bold" in obj.chars[0].font
                ):
                    if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) *", content)) is not None:
                        start = min(len(match.group(0)), len(obj.chars) - 1)
                        marker = match.group(1)
                        size = marker.count(".") + 2
                    else:
                        start = 0
                        marker = None
                        # NOTE(review): this is a one-character str while the branch above
                        # yields an int; confirm that consumers of `size` accept both.
                        size = linesize[1]
                    name = f"head{size}"
                    # Check if we're already parsing a heading, do not split into two
                    if parent_name(current) != name or newlines > 2:
                        content_start = start
                        xpos = round(obj.chars[content_start].bbox.left)
                        current = Node(name, parent=root, obj=obj, xpos=xpos, size=size, marker=marker)
                        current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if the line is a note and deal with the indentation correctly
                elif (
                    with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None
                ):
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    # print(obj.fonts)
                    # Correct xposition only if the Note: string is very far left
                    if xpos + 4 * x_em <= current.xpos:
                        xpos = round(obj.chars[content_start].bbox.left)
                    # Prevent nesting of notes, they should only be listed
                    if parent_name(current) == "note":
                        current = current.parent.parent
                    current = unindent(xpos, current, 2)
                    current = Node(
                        "note",
                        parent=current,
                        obj=obj,
                        xpos=xpos,
                        type=match.group(1).lower(),
                        _font=obj.chars[content_start].font,
                    )
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is Table or Figure caption
                elif with_graphics and (
                    (match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? ?", content)) is not None
                    and "Bold" in obj.chars[0].font
                ):
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root)
                    current = Node(
                        "caption",
                        parent=current,
                        obj=obj,
                        xpos=xpos,
                        _type=match.group(1).lower(),
                        number=int(match.group(2)),
                    )
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is list and group them according to indentation
                elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None:
                    current = unindent(xpos, current, newlines)
                    # The pattern consumes two characters of the item text after the marker
                    content_start = len(match.group(0)) - 2
                    xpos = round(obj.chars[content_start].bbox.left)
                    name = "listb"
                    value = lcontent[0]
                    if value in {"–", "-"}:
                        name = "lists"
                    elif value.isalpha():
                        name = "lista"
                    elif value.isnumeric():
                        name = "listn"
                        value = int(match.group(2))
                    current = Node(name, parent=current, obj=obj, xpos=xpos, value=value)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is a register bit definition
                elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None:
                    if obj.contains_font("Bold"):
                        # Use the bold character as delimiter
                        content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font)
                    else:
                        # Default back to the regex
                        if "Reserved" not in content:
                            _LOGGER.warning(
                                f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}"
                            )
                        content_start = re.match(
                            r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content
                        )
                        if content_start is None:
                            _LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
                            content_start = 0
                        else:
                            content_start = len(content_start.group(0))
                    if not content_start:
                        _LOGGER.error(f"Missing content start (=0)! '{content}'!")
                    content_start = min(content_start, len(obj.chars) - 1)

                    current = next((c for c in current.iter_path_reverse() if c.name.startswith("head")), root)
                    middle = obj.chars[content_start].bbox.left
                    xpos = round(middle)
                    current = Node(
                        "bit",
                        parent=current,
                        obj=obj,
                        xpos=xpos,
                        _page=self,
                        _middle=middle,
                        _left=area.left,
                        _right=area.right,
                    )
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if this is a new paragraph
                elif newlines >= 2 or current.name not in {"para"}:
                    # Fix issues where notes are reflowing back left of Note: text
                    if parent_name(current) in {"note"}:
                        if xpos < current.parent.xpos:
                            xpos = current.parent.xpos
                    # Prevent multiline
                    current = unindent(xpos, current, newlines)
                    current = Node("para", parent=current, obj=obj, xpos=xpos if current.is_root else current.xpos)

                elif parent_name(current) not in {"caption", "bit", "area"}:
                    current = unindent(xpos, current, newlines)

                # Add the actual line
                Node("line", parent=current, obj=obj, xpos=xpos, start=content_start, str=content[content_start:50])

                ypos = obj.origin

        return root

    def __repr__(self) -> str:
        return f"StmPage({self.number})"
This class provides low-level access to the graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.
Page(document, index: int)
def __init__(self, document, index: int):
    """
    Select the layout template from the PDF's "Producer" metadata and wire up
    the matching area, spacing, color and line-size helpers.

    :param document: a PDF document.
    :param index: 0-indexed page number.
    """
    super().__init__(document, index)
    producer = self.pdf.metadata.get("Producer", "").lower()

    # Adobe producers take precedence over the "antenna" check.
    from_adobe = "acrobat" in producer or "adobe" in producer
    if not from_adobe and "antenna" in producer:
        self._template = "blue_gray"
    else:
        self._template = "black_white"
        if not from_adobe:
            _LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")

    if self._template == "blue_gray":
        make_areas, make_spacing = _areas_blue_gray, _spacing_blue_gray
        colors, line_size = _colors_blue_gray, _linesize_blue_gray
    else:
        make_areas, make_spacing = _areas_black_white, _spacing_black_white
        colors, line_size = _colors_black_white, _linesize_black_white

    # Areas and spacing are computed for this page; the other two are stored as callables.
    self._areas = make_areas(self)
    self._spacing = make_spacing(self)
    self._colors = colors
    self._line_size = line_size
Parameters
- document: a PDF document.
- index: 0-indexed page number.
is_relevant: bool
@cached_property
def is_relevant(self) -> bool:
    """Is this page relevant for the conversion?"""
    # Pages whose top text marks them as contents, lists or index are not relevant.
    irrelevant_markers = ("Contents", "List of ", "Index")
    header = self.top
    return not any(marker in header for marker in irrelevant_markers)
Is this page relevant for the conversion?
content_ast: list
@property
def content_ast(self) -> list:
    """The abstract syntax trees in the content area."""
    include_graphics = True
    if "DS" in self.pdf.name:
        # FIXME: Terrible hack to get the ordering information table fixed
        # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
        ordering_pages = (
            entry.page_index
            for entry in self.pdf.toc
            if entry.level == 0
            and re.search("ordering +information|part +numbering", entry.title, re.IGNORECASE)
        )
        # Graphics are disabled only on the first matching top-level TOC page.
        include_graphics = next(ordering_pages, -1) != self.index

    trees = [self.ast_in_area(area, with_graphics=include_graphics) for area in self._areas["content"]]

    # Add a page node to the first leaf to keep track of where a page starts
    anchor = next((node for node in trees[0].descendants if node.is_leaf), trees[0])
    Node("page", parent=anchor, xpos=anchor.xpos, number=self.number)
    return trees
The abstract syntax trees in the content area.
def
graphics_in_area( self, area: modm_data.utils.Rectangle) -> list[modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure]:
def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
    """
    Find all tables and figures in this area.

    Graphic clusters are first matched with bold "Figure N." / "Table N."
    captions; the remaining clusters are classified by template-specific
    heuristics. The result is ordered by the graphics bounding box.

    :param area: area to search for graphics.
    :return: list of tables and figures.
    """
    # Find all graphic clusters in this area
    em = self._spacing["y_em"]
    large_area = area.offset_x(em / 2)
    graphic_clusters = self.graphic_clusters(lambda p: large_area.contains(p.bbox), em / 2)
    # for bbox, paths in raw_graphic_clusters:
    #     # Some docs have large DRAFT chars in the background
    #     if any(path.fill == 0xe6e6e6ff and path.stroke == 0xff for path in paths):
    #         continue
    #     graphic_clusters.append((bbox, paths))

    # Find the captions and group them by y origin to catch side-by-side figures
    ycaptions = defaultdict(list)
    for line in self.charlines_in_area(area, lambda c: "Bold" in c.font):
        for cluster in line.clusters():
            for phrase in [r"Figure \d+\.", r"Table \d+\."]:
                if re.match(phrase, cluster.content):
                    ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars))
    ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)]

    # Now associate these captions with the graphics bboxes
    categories = []
    for captions in ycaptions:
        # Captions sharing a y origin split the area width evenly between them
        width = area.width / len(captions)
        for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)):
            left, right = area.left + ii * width, area.left + (ii + 1) * width
            bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height

            # Find the graphic associated with this caption
            graphic = next(
                ((b, p) for b, p in graphic_clusters if b.bottom <= bottom and left <= b.left and b.right <= right),
                None,
            )
            if graphic is None:
                _LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
                continue

            if self._template == "blue_gray":
                # Search for all lines of the current caption with the same properties
                cbbox = Rectangle(left, bottom, right, top)
                cchars = self.chars_in_area(cbbox)
                while True:
                    nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top)
                    nchars = self.chars_in_area(nbbox)
                    # Stop growing once the enlarged box contains no additional characters
                    if len(cchars) >= len(nchars):
                        break
                    cbbox = nbbox
                    cchars = nchars
            else:
                cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)

            otype = phrase.split(" ")[0].lower()
            if "Figure" in phrase:
                # Find all other graphics in the bounding box
                gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom)
                graphics = []
                for b, p in graphic_clusters:
                    if gbbox.overlaps(b):
                        graphics.append((b, p))
                for g in graphics:
                    graphic_clusters.remove(g)
                gbbox = [cluster[0] for cluster in graphics]
                gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox)
                paths = [p for cluster in graphics for p in cluster[1]]

                if self._template == "blue_gray":
                    # Search for characters below the graphics bbox, max 1 y_em
                    gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom)
                    while True:
                        gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom)
                        if not self.chars_in_area(gbbox):
                            break
                # Generate the new bounding box which includes the caption
                gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom)
            elif "Table" in phrase:
                graphic_clusters.remove(graphic)
                gbbox, paths = graphic
                if (
                    self._template == "black_white"
                    and sum(1 for path in paths if path.count == 2) >= len(paths) / 2
                ):
                    otype += "_lines"
            categories.append((otype, cbbox, gbbox, paths))

    # Deal with the remaining graphic categories
    for gbbox, paths in graphic_clusters:
        if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
            continue
        category = ""
        if any(isinstance(p, Image) for p in paths):
            category = "figure"
        elif self._template == "blue_gray":
            if all(self._colors(path.stroke) == "gray" or self._colors(path.fill) == "darkblue" for path in paths):
                category = "table"
            else:
                category = "figure"
        elif self._template == "black_white":
            # Some tables are rendered explicitly with filled rectangular
            # shapes while others are implicitly rendered with stroked lines
            stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2
            is_table = stroked_table_lines or all(
                [any(p.isclose(pp) for pp in path.bbox.points) for p in path.points].count(True)
                >= len(path.points) * 2 / 3
                for path in paths
            )
            if len(paths) > 1 and is_table:
                category = "table"
                if stroked_table_lines:
                    category += "_lines"
            else:
                category = "figure"

        if "table" in category:
            # Check if there are only numbers on top of the table
            cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"])
            nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xA, 0xD}]

            if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3:
                # This is a register table with invisible top borders!
                cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars))
                gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top)
                name = "register_" + category
            else:
                cbbox = None
                name = category
            categories.append((name, cbbox, gbbox, paths))
        else:
            categories.append(("figure", None, gbbox, paths))

    # Convert the objects into specialized classes
    categories.sort(key=lambda o: (-o[2].y, o[2].x))
    objects = []
    for otype, caption_bbox, graphics_bbox, graphics_paths in categories:
        if "figure" in otype:
            figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths)
            objects.append(figure)
        elif "table" in otype:
            xlines, ylines, yhlines = [], [], []
            for path in graphics_paths:
                if self._template == "blue_gray" or "_lines" in otype:
                    if self._colors(path.stroke) == "gray" or "_lines" in otype:
                        # Intercell paths in gray
                        if len(path.lines) == 1:
                            line = path.lines[0]
                            if line.direction == line.Direction.VERTICAL:
                                xlines.append(line.specialize())
                            elif line.direction == line.Direction.HORIZONTAL:
                                ylines.append(line.specialize())
                            else:
                                _LOGGER.warning(f"Line not vertical or horizontal: {line}")
                        else:
                            _LOGGER.warning(f"Path too long: {path}")
                    elif self._colors(path.fill) == "darkblue":
                        # Add the bottom line of the dark blue header box as a very thick line
                        line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
                        yhlines.append(line)

                elif self._template == "black_white":
                    bbox = path.bbox
                    is_vertical = bbox.width < bbox.height
                    width = bbox.width if is_vertical else bbox.height
                    length = bbox.height if is_vertical else bbox.width
                    if width <= self._spacing["x_em"] / 2:
                        if length >= self._spacing["y_em"] / 2:
                            if is_vertical:
                                line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width)
                                xlines.append(line)
                            else:
                                line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height)
                                ylines.append(line)
                    else:
                        # Split the rectangle into its outline
                        xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1))
                        xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1))
                        ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1))
                        ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1))
            if yhlines:
                # Only the header line with the smallest y coordinate is kept
                yhlines.sort(key=lambda line: line.p0.y)
                ylines.append(yhlines[0])
            if not xlines or not ylines:
                continue
            table = Table(self, graphics_bbox, xlines, ylines, caption_bbox, is_register="register" in otype)
            objects.append(table)

    return objects
Find all tables and figures in this area.
Parameters
- area: area to search for graphics.
Returns
list of tables and figures.
def
ast_in_area( self, area: modm_data.utils.Rectangle, with_graphics: bool = True, ignore_xpos: bool = False, with_bits: bool = True, with_notes: bool = True) -> anytree.node.node.Node:
def ast_in_area(
    self,
    area: Rectangle,
    with_graphics: bool = True,
    ignore_xpos: bool = False,
    with_bits: bool = True,
    with_notes: bool = True,
) -> Node:
    """
    Convert the area content into an abstract syntax tree.

    The objects yielded by `objects_in_area` are attached, in iteration order, to a tree rooted at an
    `area` node. Tables and figures become `table`/`figure` nodes. Every text line becomes a `line`
    node, placed below a `para` node which itself may hang below a `headN`, `note`, `caption`,
    `listb`/`lists`/`lista`/`listn` or `bit` node, depending on the line's font size, leading text
    and horizontal position.

    :param area: area to search for content.
    :param with_graphics: forwarded to `objects_in_area`; also enables detection of bold
                          "Table N"/"Figure N" caption lines.
    :param ignore_xpos: if set, the hierarchy is never walked up just because a line starts further
                        left than the current node.
    :param with_bits: enables detection of register bit definition lines ("Bits 31:16 ...").
    :param with_notes: enables detection of "Note", "Caution" and "Warning" lines.
    :return: the root `area` node of the abstract syntax tree.
    """
    x_em = self._spacing["x_em"]
    spacing_content = self._spacing["x_content"]
    lh_factor = self._spacing["lh"]
    # Headings (other than h1) must start left of this x position
    heading_xlimit = spacing_content + 2 * x_em
    root = Node("area", obj=area, xpos=int(area.left), page=self)

    def unindent(_xpos, _current, _newlines=1):
        current = _current
        # Walk up while the line starts more than one em left of the current node
        while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos:
            current = current.parent
        # A gap of two or more lines also terminates the current paragraph
        if _newlines >= 2 and current.name == "para":
            current = current.parent
        return current

    def parent_name(current):
        return "" if current.parent is None else current.parent.name

    def enclosing_heading(node):
        # First node on the reverse path whose name starts with "head", otherwise the root
        return next((c for c in node.iter_path_reverse() if c.name.startswith("head")), root)

    current = root
    ypos = area.top
    for obj in self.objects_in_area(area, with_graphics):
        xpos = round(obj.bbox.left)

        # Tables should remain in their current hierarchy regardless of indentation
        if isinstance(obj, (Table, Figure)):
            current = enclosing_heading(current)
            name = "figure" if isinstance(obj, Figure) else "table"
            Node(
                name,
                parent=current,
                obj=obj,
                xpos=xpos,
                number=-1,
                _width=obj.bbox.width / area.width,
                _type=obj._type,
            )
            ypos = obj.bbox.bottom

        # Lines of text need to be carefully checked for indentation
        elif isinstance(obj, CharLine):
            # Vertical distance to the previous object, measured in scaled line heights
            newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
            content = obj.content
            content_start = 0
            linesize = self._line_size(obj)

            # Check when the note has finished (=> paragraphs without italic)
            if parent_name(current) == "note" and (
                (current.parent.type == "note" and not obj.contains_font(current.parent._font))
                or (current.parent.type in {"caution", "warning"} and newlines >= 2)
            ):
                current = current.parent.parent

            # Check when the list ends into something indented far too right
            elif parent_name(current).startswith("list") and (xpos - current.xpos) >= 2 * x_em:
                current = current.parent.parent

            # Check if line is a heading, which may be multi-line, so we must
            # be careful not to nest them, but group them properly
            # Headings are always inserted into the root node!
            if linesize.startswith("h1") or (
                linesize.startswith("h") and xpos < heading_xlimit and "Bold" in obj.chars[0].font
            ):
                if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) *", content)) is not None:
                    # Numbered heading: the nesting depth follows from the number of dots
                    start = min(len(match.group(0)), len(obj.chars) - 1)
                    marker = match.group(1)
                    size = marker.count(".") + 2
                else:
                    # Unnumbered heading: fall back to the level encoded in the line size
                    start = 0
                    marker = None
                    size = linesize[1]
                name = f"head{size}"
                # Check if we're already parsing a heading, do not split into two
                if parent_name(current) != name or newlines > 2:
                    content_start = start
                    xpos = round(obj.chars[content_start].bbox.left)
                    current = Node(name, parent=root, obj=obj, xpos=xpos, size=size, marker=marker)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

            # Check if the line is a note and deal with the indentation correctly
            elif (
                with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None
            ):
                content_start = min(len(match.group(0)), len(obj.chars) - 1)
                # Correct xposition only if the Note: string is very far left
                if xpos + 4 * x_em <= current.xpos:
                    xpos = round(obj.chars[content_start].bbox.left)
                # Prevent nesting of notes, they should only be listed
                if parent_name(current) == "note":
                    current = current.parent.parent
                current = unindent(xpos, current, 2)
                current = Node(
                    "note",
                    parent=current,
                    obj=obj,
                    xpos=xpos,
                    type=match.group(1).lower(),
                    _font=obj.chars[content_start].font,
                )
                current = Node("para", parent=current, obj=obj, xpos=current.xpos)

            # Check if line is Table or Figure caption
            elif with_graphics and (
                (match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? ?", content)) is not None
                and "Bold" in obj.chars[0].font
            ):
                content_start = min(len(match.group(0)), len(obj.chars) - 1)
                current = enclosing_heading(current)
                current = Node(
                    "caption",
                    parent=current,
                    obj=obj,
                    xpos=xpos,
                    _type=match.group(1).lower(),
                    number=int(match.group(2)),
                )
                current = Node("para", parent=current, obj=obj, xpos=current.xpos)

            # Check if line is list and group them according to indentation
            elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None:
                current = unindent(xpos, current, newlines)
                # The pattern ends with two characters of item text, so step back over them.
                # Clamp to the available characters like every other branch does.
                content_start = min(len(match.group(0)) - 2, len(obj.chars) - 1)
                xpos = round(obj.chars[content_start].bbox.left)
                name = "listb"
                value = content.lstrip()[0]
                if value in {"–", "-"}:
                    name = "lists"
                elif value.isalpha():
                    name = "lista"
                elif value.isnumeric():
                    name = "listn"
                    value = int(match.group(2))
                current = Node(name, parent=current, obj=obj, xpos=xpos, value=value)
                current = Node("para", parent=current, obj=obj, xpos=current.xpos)

            # Check if line is a register bit definition
            elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None:
                if obj.contains_font("Bold"):
                    # Use the bold character as delimiter
                    content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font)
                else:
                    # Default back to the regex
                    if "Reserved" not in content:
                        _LOGGER.warning(
                            f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}"
                        )
                    bit_match = re.match(
                        r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content
                    )
                    if bit_match is None:
                        _LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
                        content_start = 0
                    else:
                        content_start = len(bit_match.group(0))
                if not content_start:
                    _LOGGER.error(f"Missing content start (=0)! '{content}'!")
                content_start = min(content_start, len(obj.chars) - 1)

                current = enclosing_heading(current)
                # Left edge of the first description character, stored unrounded as `_middle`
                middle = obj.chars[content_start].bbox.left
                xpos = round(middle)
                current = Node(
                    "bit",
                    parent=current,
                    obj=obj,
                    xpos=xpos,
                    _page=self,
                    _middle=middle,
                    _left=area.left,
                    _right=area.right,
                )
                current = Node("para", parent=current, obj=obj, xpos=current.xpos)

            # Check if this is a new paragraph
            elif newlines >= 2 or current.name != "para":
                # Fix issues where notes are reflowing back left of Note: text
                if parent_name(current) == "note" and xpos < current.parent.xpos:
                    xpos = current.parent.xpos
                # Prevent multiline
                current = unindent(xpos, current, newlines)
                current = Node("para", parent=current, obj=obj, xpos=xpos if current.is_root else current.xpos)

            # Continuation line: only re-check indentation outside of captions, bits and the area root
            elif parent_name(current) not in {"caption", "bit", "area"}:
                current = unindent(xpos, current, newlines)

            # Add the actual line
            Node("line", parent=current, obj=obj, xpos=xpos, start=content_start, str=content[content_start:50])

            ypos = obj.origin

    return root
Convert the area content into an abstract syntax tree.
Parameters
- area: area to search for content.
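- with_graphics: whether to include graphics in the area and to detect Table/Figure captions.
- ignore_xpos: if set, the hierarchy is never walked up just because a line starts further left than the current node.
- with_bits: whether to detect register bit definition lines.
- with_notes: whether to detect Note, Caution and Warning lines.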
Returns
An abstract syntax tree including the content formatting.
Inherited Members
- modm_data.pdf2html.page.Page
- text_in_named_area
- charlines_in_area
- graphic_bboxes_in_area
- objects_in_area
- content_objects
- content_graphics
- content_lines
- content_tables
- content_figures
- modm_data.pdf.page.Page
- index
- number
- label
- width
- height
- rotation
- bbox
- char_count
- char
- chars
- objlinks
- weblinks
- chars_in_area
- text_in_area
- structures
- find
- paths
- images
- graphic_clusters
- pypdfium2._helpers.page.PdfPage
- parent
- get_width
- get_height
- get_size
- get_rotation
- set_rotation
- get_mediabox
- set_mediabox
- get_cropbox
- set_cropbox
- get_bleedbox
- set_bleedbox
- get_trimbox
- set_trimbox
- get_artbox
- set_artbox
- get_bbox
- get_textpage
- insert_obj
- remove_obj
- gen_content
- get_objects
- render
- pypdfium2.internal.bases.AutoCloseable
- close