modm_data.pdf2html.stmicro.page
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

import re
import math
import logging
import textwrap
import statistics
from functools import cached_property, cache, reduce
from collections import defaultdict
from .table import Table
from ..figure import Figure
from ..line import CharLine
from ...utils import HLine, VLine, Rectangle, Region
from ...pdf import Path, Image, Page as PdfPage
from anytree import Node


LOGGER = logging.getLogger(__name__)


def is_compatible(document) -> bool:
    """Return True if the document's "Author" metadata contains "stmicro".

    The comparison is case-insensitive; a missing "Author" entry yields False.
    """
    return "stmicro" in document.metadata.get("Author", "").lower()


def areas_black_white(page) -> dict:
    """Compute the named page areas for the black/white document template.

    All areas are first described in relative page coordinates (0..1) and
    are scaled to page units at the end. The returned dict maps an area name
    to a single Rectangle, except "content" which maps to a list of
    Rectangles.

    Keys that may be present: "id", "date", "number", "top",
    "middle_bullets", "middle_hyphens" and "content".
    """
    def _scale(r):
        # Rotated pages swap the axes and mirror the horizontal coordinates.
        # NOTE(review): Rectangle arguments appear to be
        # (left, bottom, right, top) - confirm against ...utils.Rectangle.
        if page.rotation:
            return Rectangle(r.bottom * page.width, (1 - r.right) * page.height,
                             r.top * page.width, (1 - r.left) * page.height)
        return Rectangle(r.left * page.width, r.bottom * page.height,
                         r.right * page.width, r.top * page.height)

    bottom_left = Rectangle(0.1, 0.1, 0.3, 0.12)
    bottom_middle = Rectangle(0.3, 0.1, 0.7, 0.12)
    bottom_right = Rectangle(0.7, 0.1, 0.9, 0.12)
    top = Rectangle(0.1, 0.9125, 0.9, 0.9375)
    # The first page (index 0) has a shorter content area
    content = Rectangle(0.025, 0.12, 0.975, 0.905 if page.index else 0.79)
    all_content = [content]
    areas = {
        # Bottom string in the middle: Example "RM0410 Rev 4"
        "id": bottom_middle,
    }
    if page.index == 0:
        # Publish date on the bottom left on first page
        areas["date"] = bottom_left
        # number on the bottom right on first page
        areas["number"] = bottom_right
        # Add top areas
        all_content.insert(0, Rectangle(0.375, 0.855, 0.975, 0.9125))
        all_content.insert(1, Rectangle(0.025, 0.805, 0.975, 0.855))
    else:
        # Page number on bottom, alternating sides for odd/even page index
        areas["number"] = bottom_left if page.index % 2 else bottom_right
        # Chapter name on top
        areas["top"] = top

    # Recognize the two column design of the Datasheets with a big table underneath
    if page.index < 3 and "DS" in page.pdf.name:
        # Find a wide path that would denote the beginning of a table
        top_rect = [p.bbox.top / page.height for p in page.paths
                    if _scale(content).contains(p.bbox) and p.bbox.width > page.width * 0.75]
        if top_rect:
            # offset for table label just above it
            # Pass the list itself: max(*top_rect) raises a TypeError when
            # exactly one wide path was found, since max(float) is invalid.
            ybottom = max(top_rect) + 0.0175
        else:
            ybottom = content.bottom
        # Try to find list or sublists in these areas
        mr = Rectangle(0.49, ybottom, 0.51, content.top)
        br = Rectangle(0.51, ybottom, 0.5325, content.top)
        hr = Rectangle(0.5325, ybottom, 0.555, content.top)
        text_middle = page.text_in_area(_scale(mr))
        text_bullets = page.text_in_area(_scale(br))
        text_hyphens = page.text_in_area(_scale(hr))
        # No text crossing the page middle, but list markers right of it
        # => the page uses two columns
        if (not text_middle and
            (any(c in text_bullets for c in {"•", chr(61623)}) or
             any(c in text_hyphens for c in {"-"}))):
            areas["middle_bullets"] = br
            areas["middle_hyphens"] = hr
            # Replace the single content area with a left and right column
            all_content = all_content[:-1]
            all_content.append(Rectangle(content.left, ybottom, 0.5, content.top))
            all_content.append(Rectangle(0.505, ybottom, content.right, content.top))
            if top_rect:
                # The full-width table area below the two columns
                all_content.append(Rectangle(content.left, content.bottom, content.right, ybottom))

    areas["content"] = all_content
    scaled_areas = {}
    for name, area in areas.items():
        if isinstance(area, list):
            scaled_areas[name] = [_scale(r) for r in area]
        else:
            scaled_areas[name] = _scale(area)
    return scaled_areas


def areas_blue_gray(page) -> dict:
    """Compute the named page areas for the blue/gray document template.

    Areas are described in relative page coordinates (0..1) and scaled to
    page units at the end. The returned dict maps an area name to a single
    Rectangle, except "content" which maps to a list of Rectangles.

    Keys: "id", "top", "all_content", "content" and optionally
    "middle_bullets".
    """
    def _scale(r):
        return Rectangle(r.left * page.width, r.bottom * page.height,
                         r.right * page.width, r.top * page.height)

    # This template doesn't use rotated pages, instead uses
    # hardcoded rotated page dimensions
    if page.width > page.height:
        content = Rectangle(0.05, 0.025, 0.89, 0.975)
        bottom_left = Rectangle(0, 0.6, 0.05, 1)
        top_right = Rectangle(0.9025, 0.05, 0.9175, 0.7)
    else:
        content = Rectangle(0.025, 0.05, 0.975, 0.89 if page.index else 0.81)
        bottom_left = Rectangle(0, 0, 0.4, 0.05)
        top_right = Rectangle(0.3, 0.9025, 0.95, 0.9175)
    areas = {
        "id": bottom_left,
        "top": top_right,
        "all_content": content,
        "content": []
    }
    if page.index == 0:
        areas["content"] = [
            # Document device string
            Rectangle(0.4, 0.91, 0.95, 0.95),
            # Document description string
            Rectangle(0.05, 0.81, 0.95, 0.86)
        ]
    if page.index < 10:
        # Contains only a table with product summary
        br = Rectangle(0.35, content.bottom, 0.37, content.top)
        text_bullets = page.text_in_area(_scale(br))
        if any(c in text_bullets for c in {"•", chr(61623)}):
            areas["middle_bullets"] = br
            # Contains the actual content here
            left = Rectangle(content.left, content.bottom, 0.3565, content.top)
            right = Rectangle(0.3565, content.bottom, content.right, content.top)
            areas["content"].extend([left, right])
        else:
            # NOTE: this also replaces the first-page title areas set above
            areas["content"] = [content]
    else:
        areas["content"] = [content]

    scaled_areas = {}
    for name, area in areas.items():
        if isinstance(area, list):
            scaled_areas[name] = [_scale(r) for r in area]
        else:
            scaled_areas[name] = _scale(area)
    return scaled_areas


def spacing_black_white(page) -> dict:
    """Return the spacing constants of the black/white template for a page.

    Positional values are scaled by the page dimensions, while "lh", "sc"
    and "th" are unit-less factors. Rotated pages override several entries.
    """
    content = 0.1125
    spacing = {
        # Horizontal spacing: left->right
        "x_em": 0.01 * page.width,
        "x_left": content * page.width,
        "x_right": (1 - content) * page.width,
        "x_content": 0.2075 * page.width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * page.height,
        # Max table line thickness
        "y_tline": 0.005 * page.height,
        # Max line height distance to detect paragraphs
        "lh": 0.9,
        # Max line height distance to detect super-/subscript
        "sc": 0.325,
        # Table header cell bold text threshold
        "th": 0.33,
    }
    if page.rotation:
        content = 0.14
        spacing.update({
            "x_em": 0.01 * page.height,
            "y_em": 0.01 * page.width,
            "x_left": content * page.width,
            "x_right": (1 - content) * page.width,
            "x_content": 0.2075 * page.width,
            "y_tline": 0.005 * page.width,
            "lh": 1.2,
            "sc": 0.4,
        })
    return spacing


def spacing_blue_gray(page) -> dict:
    """Return the spacing constants of the blue/gray template for a page.

    Positional values are scaled by the page dimensions, while "lh", "sc"
    and "th" are unit-less factors. Rotated pages override several entries.
    """
    content = 0.07
    spacing = {
        # Horizontal spacing: left->right
        "x_em": 0.01 * page.width,
        "x_left": content * page.width,
        "x_right": (1 - content) * page.width,
        "x_content": 0.165 * page.width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * page.height,
        # Max table line thickness
        "y_tline": 0.005 * page.height,
        # Max line height distance to detect paragraphs
        "lh": 0.9,
        # Max line height distance to detect super-/subscript
        "sc": 0.3,
        # Table header cell bold text threshold
        "th": 0.33,
    }
    if page.rotation:
        spacing.update({
            "x_em": 0.01 * page.height,
            "y_em": 0.01 * page.width,
            "x_left": 0.05 * page.width,
            "x_right": (1 - 0.16) * page.width,
            "x_content": 0.2075 * page.width,
            "y_tline": 0.005 * page.width,
            "lh": 1.6,
            "sc": 0.2,
        })
    return spacing


def linesize_black_white(line: CharLine) -> str:
    """Classify a line of the black/white template by its height.

    Returns "h1".."h4" for headings, "n" for normal text and "fn" for
    anything smaller. Only `line.height` is read.
    """
    rsize = line.height
    if rsize >= 17.5: return "h1"
    elif rsize >= 15.5: return "h2"
    elif rsize >= 13.5: return "h3"
    elif rsize >= 11.4: return "h4"
    elif rsize >= 8.5: return "n"
    else: return "fn"


def linesize_blue_gray(line: CharLine) -> str:
    """Classify a line of the blue/gray template by its rounded height.

    Returns "h1".."h4" for headings, "n" for normal text and "fn" for
    anything smaller. Only `line.height` is read.
    """
    rsize = round(line.height)
    if rsize >= 16: return "h1"
    elif rsize >= 14: return "h2"
    elif rsize >= 12: return "h3"
    elif rsize >= 10: return "h4"
    elif rsize >= 7: return "n"
    else: return "fn"


def colors_black_white(color: int) -> str:
    """Map a color value to "black", "white" or "unknown".

    NOTE(review): the constants look like 0xRRGGBBAA values - confirm
    against the PDF backend.
    """
    if 0xff <= color <= 0xff:
        return "black"
    if 0xffffffff <= color <= 0xffffffff:
        return "white"
    return "unknown"


def colors_blue_gray(color: int) -> str:
    """Map a color value to a color name of the blue/gray template.

    Returns one of "black", "white", "gray", "lightblue", "darkblue",
    "blue" or "unknown".

    NOTE(review): the constants look like 0xRRGGBBAA values - confirm
    against the PDF backend.
    """
    if 0xff <= color <= 0xff:
        return "black"
    if 0xffffffff <= color <= 0xffffffff:
        return "white"
    if 0xb9c4caff <= color <= 0xb9c4caff:
        return "gray"
    if 0x1f81afff <= color <= 0x1f81afff:
        return "lightblue"
    if 0x2052ff <= color <= 0x2052ff:
        return "darkblue"
    if 0x39a9dcff <= color <= 0x39a9dcff:
        return "blue"
    return "unknown"


class Page(PdfPage):
    """A PDF page of a STMicro document with template-specific layout rules.

    The template ("black_white" or "blue_gray") is selected from the
    document's "Producer" metadata and determines the page areas, spacing
    constants, color names and line size classification used when
    extracting lines, figures and tables.
    """

    def __init__(self, document, index: int):
        """Select the page template and precompute its areas and spacing.

        :param document: the document this page belongs to.
        :param index: the zero-based page index.
        """
        super().__init__(document, index)
        self._template = "black_white"
        producer = self.pdf.metadata.get("Producer", "").lower()
        if "acrobat" in producer:
            pass # default
        elif "antenna" in producer:
            self._template = "blue_gray"
        else:
            LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")

        if "blue_gray" in self._template:
            self._areas = areas_blue_gray(self)
            self._spacing = spacing_blue_gray(self)
            self._colors = colors_blue_gray
            self._line_size = linesize_blue_gray
        elif "black_white" in self._template:
            self._areas = areas_black_white(self)
            self._spacing = spacing_black_white(self)
            self._colors = colors_black_white
            self._line_size = linesize_black_white

        # Patches to detect the header cells correctly
        if ((self.pdf.name == "DS12930-v1" and self.index in range(90, 106)) or
            (self.pdf.name == "DS12931-v1" and self.index in range(89, 105))):
            self._spacing["th"] = 0.1
        if ((self.pdf.name == "RM0453-v2" and self.index in [1354]) or
            (self.pdf.name == "RM0456-v2" and self.index in [2881]) or
            (self.pdf.name == "RM0456-v3" and self.index in [2880]) or
            (self.pdf.name == "RM0461-v4" and self.index in [1246])):
            self._spacing["th"] = 0.5
        if ((self.pdf.name == "RM0456-v2" and self.index in [3005])):
            self._spacing["th"] = 0.52

    def _text_in_area(self, name, check_length=True) -> str:
        """Return the concatenated text of the named area(s).

        :param name: key into the precomputed areas; unknown names yield "".
        :param check_length: if set, assert that the resulting text is not empty.
        """
        if name not in self._areas: return ""
        text = ""
        areas = self._areas[name]
        if not isinstance(areas, list): areas = [areas]
        for area in areas:
            text += self.text_in_area(area)
        if check_length: assert text
        return text

    @cached_property
    def identifier(self) -> str:
        """The text found in the "id" area of the page (may be empty)."""
        return self._text_in_area("id", check_length=False)

    @cached_property
    def top(self) -> str:
        """The text of the "top" area, or "Cover" for the first page."""
        if self.index == 0:
            return "Cover"
        return self._text_in_area("top", check_length=False)

    def is_relevant(self) -> bool:
        """Return False for pages whose top text marks a contents, list or index page."""
        if any(c in self.top for c in {"Contents", "List of ", "Index"}):
            return False
        return True

    def _charlines_filtered(self, area, predicate = None, rtol = None) -> list[CharLine]:
        """Group the characters inside an area into sorted lines of text.

        Characters are bucketed by their rounded origin (y for horizontal
        text, x for rotated text), converted into CharLine objects, merged
        when neighbouring lines overlap (e.g. super- and subscripts) and
        finally sorted along the reading direction.

        :param area: the area to collect the characters from.
        :param predicate: optional filter; characters for which it returns
                          a falsy value are dropped.
        :param rtol: overlap tolerance relative to the line height, defaults
                     to the template's "sc" spacing.
        :return: the list of lines, empty if the area contains no text.
        """
        if rtol is None: rtol = self._spacing["sc"]
        # Split all chars into lines based on rounded origin
        origin_lines_y = defaultdict(list)
        origin_lines_x = defaultdict(list)
        for char in self.chars_in_area(area):
            # Ignore all characters we don't want
            if predicate is not None and not predicate(char):
                continue
            # Ignore Carriage Return characters and ® (superscript issues)
            if char.unicode in {0xd, ord("®")}:
                continue
            # Correct some weird unicode stuffing choices
            if char.unicode in {2}:
                char.unicode = ord("-")
            if char.unicode in {61623, 61664}:
                char.unicode = ord("•")
            if char.unicode < 32 and char.unicode not in {0xa}:
                continue
            # Report characters without width that are not whitespace
            # (they are only logged, not dropped)
            if not char.width and char.unicode not in {0xa, 0xd, 0x20}:
                LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
            # Split up the chars depending on the orientation
            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
                origin_lines_x[round(char.origin.x, 1)].append(char)
            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
                origin_lines_y[round(char.origin.y, 1)].append(char)
            else:
                # The logging call needs a complete message: extra positional
                # arguments without placeholders break the log formatting.
                LOGGER.error(f"Unknown char rotation: {char} {char.rotation}")

        # Convert characters into lines
        bbox_lines_y = []
        for chars in origin_lines_y.values():
            # Remove lines with whitespace only
            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
                continue
            origin = statistics.fmean(c.origin.y for c in chars)
            line = CharLine(self, chars,
                            min(c.bbox.bottom for c in chars),
                            origin,
                            max(c.bbox.top for c in chars),
                            max(c.height for c in chars),
                            sort_origin=self.height - origin)
            bbox_lines_y.append(line)
        bbox_lines = sorted(bbox_lines_y, key=lambda l: l._sort_origin)

        bbox_lines_x = []
        for chars in origin_lines_x.values():
            # Remove lines with whitespace only
            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
                continue
            line = CharLine(self, chars,
                            min(c.bbox.left for c in chars),
                            statistics.fmean(c.origin.x for c in chars),
                            max(c.bbox.right for c in chars),
                            max(c.width for c in chars),
                            270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90)
            bbox_lines_x.append(line)
        bbox_lines += sorted(bbox_lines_x, key=lambda l: l._sort_origin)

        if not bbox_lines:
            return []

        # Merge lines that have overlapping bbox_lines
        # FIXME: This merges lines that "collide" vertically like in formulas
        merged_lines = []
        current_line = bbox_lines[0]
        for next_line in bbox_lines[1:]:
            height = max(current_line.height, next_line.height)
            # Calculate overlap via normalize origin (increasing with line index)
            if ((current_line._sort_origin + rtol * height) >
                (next_line._sort_origin - rtol * height)):
                # if line.rotation or self.rotation:
                #     # The next line overlaps this one, we merge the shorter line
                #     # (typically super- and subscript) into taller line
                #     use_current = len(current_line.chars) >= len(next_line.chars)
                # else:
                use_current = current_line.height >= next_line.height
                line = current_line if use_current else next_line
                current_line = CharLine(self, current_line.chars + next_line.chars,
                                        line.bottom, line.origin, line.top,
                                        height, line.rotation,
                                        sort_origin=line._sort_origin)
            else:
                # The next line does not overlap the current line
                merged_lines.append(current_line)
                current_line = next_line
        # append last line
        merged_lines.append(current_line)

        # Sort all lines horizontally based on character origin
        # Newline characters are forced to the end of their line via the
        # large +-1e9 offsets in the sort keys.
        sorted_lines = []
        for line in merged_lines:
            if line.rotation == 90:
                def sort_key(char):
                    if char.unicode in {0xa, 0xd}:
                        return char.tbbox.midpoint.y - 1e9
                    return char.tbbox.midpoint.y
            elif line.rotation == 270:
                def sort_key(char):
                    if char.unicode in {0xa, 0xd}:
                        return -char.tbbox.midpoint.y + 1e9
                    return -char.tbbox.midpoint.y
            else:
                def sort_key(char):
                    if char.unicode in {0xa, 0xd}:
                        return char.origin.x + 1e9
                    return char.origin.x
            sorted_lines.append(CharLine(self, sorted(line.chars, key=sort_key),
                                         line.bottom, line.origin,
                                         line.top, line.height,
                                         line.rotation, area.left,
                                         sort_origin=line._sort_origin))

        return sorted_lines

    def _content_areas(self, area: Rectangle, with_graphics: bool = True) -> list:
        """Split an area into text-only areas and graphic areas from top to bottom.

        :param area: the area to split.
        :param with_graphics: if False, the area is returned unsplit.
        :return: a list of `(Rectangle, obj)` tuples, where `obj` is the
                 graphic occupying the rectangle or None for text areas.
        """
        if with_graphics:
            graphics = self._graphics_filtered(area)
            # Group graphics that overlap vertically into shared regions
            regions = []
            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
                for reg in regions:
                    if reg.overlaps(gbbox.bottom, gbbox.top):
                        # They overlap, so merge them
                        reg.v0 = min(reg.v0, gbbox.bottom)
                        reg.v1 = max(reg.v1, gbbox.top)
                        reg.objs.append(graphic)
                        break
                else:
                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))

            areas = []
            ypos = area.top
            for reg in regions:
                # Only add a text area if there is enough space above the region
                if ypos - reg.v1 > self._spacing["y_em"]:
                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
                for obj in reg.objs:
                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
                    areas.append((oarea, obj))
                ypos = reg.v0
            # The remaining text area below the last region
            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
        else:
            areas = [(area, None)]
        return areas

    def _objects_filtered(self, area: Rectangle, with_graphics: bool = True) -> list:
        """Return all lines, figures and tables of an area in reading order.

        :param area: the area to extract the objects from.
        :param with_graphics: if False, only lines of text are returned.
        """
        self._link_characters()
        areas = self._content_areas(area, with_graphics)
        objects = []
        for narea, obj in areas:
            if obj is None:
                objects += self._charlines_filtered(narea)
            else:
                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
                # Only keep the characters outside of the graphic itself.
                # The lambda is consumed before `obj` is rebound by the loop.
                predicate = lambda c: not obj.bbox.contains(c.origin)
                lines = self._charlines_filtered(oarea, predicate)
                objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
        return objects

    @property
    def content_ast(self) -> list:
        """The list of AST root nodes, one for every content area of the page.

        A "page" node is attached to the first leaf of the first AST.
        """
        ast = []
        with_graphics = True
        if "DS" in self.pdf.name:
            # FIXME: Terrible hack to get the ordering information table fixed
            # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
            order_page = next((item.page_index for item in self.pdf.toc if item.level == 0 and
                               re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)), -1)
            with_graphics = (order_page != self.index)
        for area in self._areas["content"]:
            ast.append(self._ast_filtered(area, with_graphics=with_graphics))
        # Add a page node to the first leaf to keep track of where a page starts
        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
        return ast

    def _graphics_filtered(self, area) -> list:
        """Find the figures and tables inside an area.

        The graphic clusters of the area are first associated with the bold
        "Figure N." and "Table N." captions, the remaining clusters are
        categorized by their paths, and all categories are finally converted
        into Figure and Table objects.

        :param area: the area to search for graphics.
        :return: a list of Figure and Table objects sorted from top to
                 bottom, then left to right.
        """
        # Find all graphic clusters in this area
        em = self._spacing["y_em"]
        large_area = area.offset_x(em/2)
        graphic_clusters = self.graphic_clusters(lambda p: large_area.contains(p.bbox), em/2)
        # for bbox, paths in raw_graphic_clusters:
        #     # Some docs have large DRAFT chars in the background
        #     if any(path.fill == 0xe6e6e6ff and path.stroke == 0xff for path in paths):
        #         continue
        #     graphic_clusters.append((bbox, paths))

        # Find the captions and group them by y origin to catch side-by-side figures
        ycaptions = defaultdict(list)
        for line in self._charlines_filtered(area, lambda c: "Bold" in c.font):
            for cluster in line.clusters():
                for phrase in [r"Figure \d+\.", r"Table \d+\."]:
                    if re.match(phrase, cluster.content):
                        ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars))
        ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)]

        # Now associate these captions with the graphics bboxes
        categories = []
        for captions in ycaptions:
            # Side-by-side captions share the area width equally
            width = area.width / len(captions)
            for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)):
                left, right = area.left + ii * width, area.left + (ii + 1) * width
                bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height

                # Find the graphic associated with this caption
                graphic = next(((b, p) for b, p in graphic_clusters
                                if b.bottom <= bottom and
                                   left <= b.left and b.right <= right), None)
                if graphic is None:
                    LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
                    continue

                if self._template == "blue_gray":
                    # Search for all lines of the current caption with the same properties
                    cbbox = Rectangle(left, bottom, right, top)
                    cchars = self.chars_in_area(cbbox)
                    while True:
                        # Grow the caption downwards until no more characters are added
                        nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top)
                        nchars = self.chars_in_area(nbbox)
                        if len(cchars) >= len(nchars):
                            break
                        cbbox = nbbox
                        cchars = nchars
                elif self._template == "black_white":
                    cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)

                otype = phrase.split(" ")[0].lower()
                if "Figure" in phrase:
                    # Find all other graphics in the bounding box
                    gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom)
                    graphics = []
                    for b, p in graphic_clusters:
                        if gbbox.overlaps(b):
                            graphics.append((b,p))
                    for g in graphics:
                        graphic_clusters.remove(g)
                    gbbox = [cluster[0] for cluster in graphics]
                    gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox)
                    paths = [p for cluster in graphics for p in cluster[1]]

                    if self._template == "blue_gray":
                        # Search for characters below the graphics bbox, max 1 y_em
                        gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom)
                        while True:
                            gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom)
                            if not self.chars_in_area(gbbox):
                                break
                    # Generate the new bounding box which includes the caption
                    # NOTE(review): applied for both templates here - confirm
                    # this nesting level against the upstream file.
                    gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom)
                elif "Table" in phrase:
                    graphic_clusters.remove(graphic)
                    gbbox, paths = graphic
                    # Tables mostly made of two-point paths are drawn with lines
                    if (self._template == "black_white" and
                        sum(1 for path in paths if path.count == 2) >= len(paths) / 2):
                        otype += "_lines"
                categories.append((otype, cbbox, gbbox, paths))

        # Deal with the remaining graphic categories
        for gbbox, paths in graphic_clusters:
            # Skip clusters that are too small to be a figure or table
            if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
                continue
            if any(isinstance(p, Image) for p in paths):
                category = "figure"
            elif self._template == "blue_gray":
                if all(self._colors(path.stroke) == "gray" or
                       self._colors(path.fill) == "darkblue" for path in paths):
                    category = "table"
                else:
                    category = "figure"
            elif self._template == "black_white":
                # Some tables are rendered explicitly with filled rectangular
                # shapes with others are implicitly rendered with stroked lines
                stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2
                # A path is rectangular if at least 2/3 of its points are
                # close to the corner points of its bounding box
                is_table = stroked_table_lines or all(
                    [any(p.isclose(pp) for pp in path.bbox.points)
                     for p in path.points].count(True) >= len(path.points) * 2 / 3
                    for path in paths)
                if (len(paths) > 1 and is_table):
                    category = "table"
                    if stroked_table_lines:
                        category += "_lines"
                else:
                    category = "figure"

            if "table" in category:
                # Check if there are only numbers on top of the table
                cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"])
                nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xa, 0xd}]

                if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3:
                    # This is a register table with invisible top borders!
                    cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars))
                    gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top)
                    name = "register_" + category
                else:
                    cbbox = None
                    name = category
                categories.append((name, cbbox, gbbox, paths))
            else:
                categories.append(("figure", None, gbbox, paths))

        # Convert the objects into specialized classes
        categories.sort(key=lambda o: (-o[2].y, o[2].x))
        objects = []
        for otype, caption_bbox, graphics_bbox, graphics_paths in categories:
            if "figure" in otype:
                figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths)
                objects.append(figure)
            elif "table" in otype:
                xlines, ylines, yhlines = [], [], []
                for path in graphics_paths:
                    if self._template == "blue_gray" or "_lines" in otype:
                        if self._colors(path.stroke) == "gray" or "_lines" in otype:
                            # Intercell paths in gray
                            if len(path.lines) == 1:
                                line = path.lines[0]
                                if line.direction == line.Direction.VERTICAL:
                                    xlines.append(line.specialize())
                                elif line.direction == line.Direction.HORIZONTAL:
                                    ylines.append(line.specialize())
                                else:
                                    LOGGER.warning(f"Line not vertical or horizontal: {line}")
                            else:
                                LOGGER.warning(f"Path too long: {path}")
                        elif self._colors(path.fill) == "darkblue":
                            # Add the bottom line of the dark blue header box as a very thick line
                            line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
                            yhlines.append(line)

                    elif self._template == "black_white":
                        bbox = path.bbox
                        is_vertical = bbox.width < bbox.height
                        width = bbox.width if is_vertical else bbox.height
                        length = bbox.height if is_vertical else bbox.width
                        if width <= self._spacing["x_em"] / 2:
                            # Thin rectangles are lines, very short ones are dropped
                            if length >= self._spacing["y_em"] / 2:
                                if is_vertical:
                                    line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width)
                                    xlines.append(line)
                                else:
                                    line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height)
                                    ylines.append(line)
                        else:
                            # Split the rectangle into its outline
                            xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1))
                            xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1))
                            ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1))
                            ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1))
                if yhlines:
                    # Only the lowest header line is used
                    yhlines.sort(key=lambda l: l.p0.y)
                    ylines.append(yhlines[0])
                # A table requires both vertical and horizontal lines
                if not xlines or not ylines:
                    continue
                table = Table(self, graphics_bbox, xlines, ylines, caption_bbox,
                              is_register="register" in otype)
                objects.append(table)

        return objects

    @property
    def content_objects(self) -> list:
        """All lines, figures and tables of all content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self._objects_filtered(area))
        return objs

    @property
    def content_graphics(self) -> list:
        """All figures and tables of all content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self._graphics_filtered(area))
        return objs

    @property
    def content_lines(self) -> list:
        """All lines of text of all content areas."""
        return [o for o in self.content_objects if isinstance(o, CharLine)]

    @property
    def content_tables(self) -> list:
        """All tables of all content areas."""
        return [o for o in self.content_graphics if isinstance(o, Table)]

    @property
    def content_figures(self) -> list:
        """All figures of all content areas."""
        return [o for o in self.content_graphics if isinstance(o, Figure)]

    def _char_properties(self, line, char):
        """Return the formatting properties of a character within its line.

        :param line: the line containing the character.
        :param char: the character to describe.
        :return: a dict with the keys "superscript", "subscript", "bold",
                 "italic", "underline", "size", "relsize" and "char".
        """
        cp = {
            "superscript": False,
            "subscript": False,
            "bold": any(frag in char.font for frag in {"Bold"}),
            "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
            "underline": (char.objlink or char.weblink) is not None,
            "size": round(line.height),
            "relsize": self._line_size(line),
            "char": chr(char.unicode),
        }

        # Super-/subscript is detected via the offset of the character origin
        # from the line origin: along x for rotated lines, along y otherwise
        if line.rotation:
            if char.origin.x < (line.origin - 0.25 * line.height):
                cp["superscript"] = True
            elif char.origin.x > (line.origin + 0.15 * line.height):
                cp["subscript"] = True
        elif char.origin.y > (line.origin + 0.25 * line.height):
            cp["superscript"] = True
        elif char.origin.y < (line.origin - 0.15 * line.height):
            cp["subscript"] = True

        return cp

    def _ast_filtered(self, area: Rectangle, with_graphics=True,
                      ignore_xpos=False, with_bits=True, with_notes=True) -> Node:
        """Convert the objects of an area into a tree of anytree nodes.

        Every line of text is attached as a "line" node to a "para" node,
        which is nested below "head*", "note", "caption", "list*" or "bit"
        nodes depending on the line's content, font and indentation.
        Figures and tables are attached to the closest heading.

        :param area: the area to convert.
        :param with_graphics: also detect figures, tables and their captions.
        :param ignore_xpos: do not unindent based on the horizontal position.
        :param with_bits: detect register bit definitions.
        :param with_notes: detect notes, cautions and warnings.
        :return: the root "area" node of the tree.
        """
        x_em = self._spacing["x_em"]
        spacing_content = self._spacing["x_content"]
        lh_factor = self._spacing["lh"]
        # spacing_y = self._spacing["y_em"]
        root = Node("area", obj=area, xpos=int(area.left), page=self)

        def unindent(_xpos, _current, _newlines=1):
            current = _current
            # Check if we need to unindent the current node
            while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos:
                current = current.parent
            # A blank line always ends the current paragraph
            if _newlines >= 2 and current.name == "para":
                current = current.parent
            return current

        def parent_name(current):
            return "" if current.parent is None else current.parent.name

        current = root
        ypos = area.top
        for obj in self._objects_filtered(area, with_graphics):
            xpos = round(obj.bbox.left)
            # Tables should remain in their current hierarchy regardless of indentation
            if isinstance(obj, (Table, Figure)):
                current = next((c for c in current.iter_path_reverse()
                                if c.name.startswith("head")), root)
                name = "figure" if isinstance(obj, Figure) else "table"
                Node(name, parent=current, obj=obj, xpos=xpos, number=-1,
                     _width=obj.bbox.width / area.width, _type=obj._type)
                ypos = obj.bbox.bottom
            # Lines of text need to be carefully checked for indentation
            elif isinstance(obj, CharLine):
                # Vertical distance to the previous object in units of line heights
                newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
                content = obj.content
                lcontent = content.lstrip()
                content_start = 0
                linesize = self._line_size(obj)

                # Check when the note has finished (=> paragraphs without italic)
                if (parent_name(current) == "note" and
                    ((current.parent.type == "note" and not obj.contains_font(current.parent._font)) or
                     (current.parent.type in {"caution", "warning"} and newlines >= 2))):
                    current = current.parent.parent

                # Check when the list ends into something indented far too right
                elif (parent_name(current).startswith("list")
                      and (xpos - current.xpos) >= 2 * x_em):
                    current = current.parent.parent

                # Check if line is a heading, which may be multi-line, so we must
                # be careful not to nest them, but group them properly
                # Headings are always inserted into the root note!
                if linesize.startswith("h1") or (linesize.startswith("h") and
                        xpos < (spacing_content + 2 * x_em) and "Bold" in obj.chars[0].font):
                    if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) *", content)) is not None:
                        start = min(len(match.group(0)), len(obj.chars) - 1)
                        marker = match.group(1)
                        size = marker.count('.') + 2
                    else:
                        start = 0
                        marker = None
                        # NOTE: this is a one-character string, while the
                        # numbered branch above yields an int
                        size = linesize[1]
                    name = f"head{size}"
                    # Check if we're already parsing a heading, do not split into two
                    if parent_name(current) != name or newlines > 2:
                        content_start = start
                        xpos = round(obj.chars[content_start].bbox.left)
                        current = Node(name, parent=root, obj=obj, xpos=xpos,
                                       size=size, marker=marker)
                        current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if the line is a note and deal with the indentation correctly
                elif with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None:
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    # Correct xposition only if the Note: string is very far left
                    if xpos + 4 * x_em <= current.xpos:
                        xpos = round(obj.chars[content_start].bbox.left)
                    # Prevent nesting of notes, they should only be listed
                    if parent_name(current) == "note":
                        current = current.parent.parent
                    current = unindent(xpos, current, 2)
                    current = Node("note", parent=current, obj=obj, xpos=xpos,
                                   type=match.group(1).lower(), _font=obj.chars[content_start].font)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is Table or Figure caption
                elif with_graphics and ((match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? ?", content)) is not None
                      and "Bold" in obj.chars[0].font):
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    current = next((c for c in current.iter_path_reverse()
                                    if c.name.startswith("head")), root)
                    current = Node("caption", parent=current, obj=obj, xpos=xpos,
                                   _type=match.group(1).lower(), number=int(match.group(2)))
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is list and group them according to indentation
                elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None:
                    current = unindent(xpos, current, newlines)
                    # The pattern consumes two characters of the actual content
                    content_start = len(match.group(0)) - 2
                    xpos = round(obj.chars[content_start].bbox.left)
                    name = "listb"
                    value = lcontent[0]
                    if value in {"–", "-"}: name = "lists"
                    elif value.isalpha(): name = "lista"
                    elif value.isnumeric():
                        name = "listn"
                        value = int(match.group(2))
                    current = Node(name, parent=current, obj=obj, xpos=xpos, value=value)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is a register bit definition
                elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None:
                    if obj.contains_font("Bold"):
                        # Use the bold character as delimiter
                        content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font)
                    else:
                        # Default back to the regex
                        if "Reserved" not in content:
                            LOGGER.warning(f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}")
                        content_start = re.match(r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content)
                        if content_start is None:
                            LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
                            content_start = 0
                        else:
                            content_start = len(content_start.group(0))
                    if not content_start:
                        LOGGER.error(f"Missing content start (=0)! '{content}'!")
                    content_start = min(content_start, len(obj.chars) - 1)

                    current = next((c for c in current.iter_path_reverse()
                                    if c.name.startswith("head")), root)
                    middle = obj.chars[content_start].bbox.left
                    xpos = round(middle)
                    current = Node("bit", parent=current, obj=obj, xpos=xpos, _page=self,
                                   _middle=middle, _left=area.left, _right=area.right)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if this is a new paragraph
                elif newlines >= 2 or current.name not in {"para"}:
                    # Fix issues where notes are reflowing back left of Note: text
                    if parent_name(current) in {"note"}:
                        if xpos < current.parent.xpos:
                            xpos = current.parent.xpos
                    # Prevent multiline
                    current = unindent(xpos, current, newlines)
                    current = Node("para", parent=current, obj=obj,
                                   xpos=xpos if current.is_root else current.xpos)

                elif (parent_name(current) not in {"caption", "bit", "area"}):
                    current = unindent(xpos, current, newlines)

                # Add the actual line
                Node("line", parent=current, obj=obj, xpos=xpos,
                     start=content_start, str=content[content_start:50])

                ypos = obj.origin

        return root

    def __repr__(self) -> str:
        return f"StPage({self.number})"
LOGGER =
<Logger modm_data.pdf2html.stmicro.page (WARNING)>
def
is_compatible(document) -> bool:
def
areas_black_white(page) -> dict:
def areas_black_white(page) -> dict:
    """Compute the named page areas for the black/white document template.

    All rectangles are first declared in relative page coordinates and are
    converted to absolute coordinates by multiplying with the page
    dimensions (axes are swapped when ``page.rotation`` is truthy).

    :param page: page object providing ``index``, ``rotation``, ``width``,
                 ``height``, ``paths``, ``pdf.name`` and ``text_in_area()``.
    :return: dictionary mapping an area name to a scaled ``Rectangle``, or
             to a list of scaled rectangles for the ``"content"`` entry.
    """
    def _scale(r):
        # Convert a relative rectangle into absolute page coordinates
        if page.rotation:
            return Rectangle(r.bottom * page.width, (1 - r.right) * page.height,
                             r.top * page.width, (1 - r.left) * page.height)
        return Rectangle(r.left * page.width, r.bottom * page.height,
                         r.right * page.width, r.top * page.height)

    bottom_left = Rectangle(0.1, 0.1, 0.3, 0.12)
    bottom_middle = Rectangle(0.3, 0.1, 0.7, 0.12)
    bottom_right = Rectangle(0.7, 0.1, 0.9, 0.12)
    top = Rectangle(0.1, 0.9125, 0.9, 0.9375)
    content = Rectangle(0.025, 0.12, 0.975, 0.905 if page.index else 0.79)
    all_content = [content]
    areas = {
        # Bottom string in the middle: Example "RM0410 Rev 4"
        "id": bottom_middle,
    }
    if page.index == 0:
        # Publish date on the bottom left on first page
        areas["date"] = bottom_left
        # number on the bottom right on first page
        areas["number"] = bottom_right
        # Add top areas
        all_content.insert(0, Rectangle(0.375, 0.855, 0.975, 0.9125))
        all_content.insert(1, Rectangle(0.025, 0.805, 0.975, 0.855))
    else:
        # Page number on bottom
        areas["number"] = bottom_left if page.index % 2 else bottom_right
        # Chapter name on top
        areas["top"] = top

    # Recognize the two column design of the Datasheets with a big table underneath
    if page.index < 3 and "DS" in page.pdf.name:
        # Find a wide path that would denote the beginning of a table
        scaled_content = _scale(content)
        top_rect = [p.bbox.top / page.height for p in page.paths
                    if scaled_content.contains(p.bbox) and p.bbox.width > page.width * 0.75]
        if top_rect:
            # offset for table label just above it
            # NOTE: max() must receive the list itself, unpacking a
            # single-element list would call max(float) and raise TypeError.
            ybottom = max(top_rect) + 0.0175
        else:
            ybottom = content.bottom
        # Try to find list or sublists in these areas
        mr = Rectangle(0.49, ybottom, 0.51, content.top)
        br = Rectangle(0.51, ybottom, 0.5325, content.top)
        hr = Rectangle(0.5325, ybottom, 0.555, content.top)
        text_middle = page.text_in_area(_scale(mr))
        text_bullets = page.text_in_area(_scale(br))
        text_hyphens = page.text_in_area(_scale(hr))
        if (not text_middle and
            (any(c in text_bullets for c in {"•", chr(61623)}) or
             any(c in text_hyphens for c in {"-"}))):
            areas["middle_bullets"] = br
            areas["middle_hyphens"] = hr
            # Replace the single content area by a left and right column
            all_content = all_content[:-1]
            all_content.append(Rectangle(content.left, ybottom, 0.5, content.top))
            all_content.append(Rectangle(0.505, ybottom, content.right, content.top))
            if top_rect:
                # The table underneath the two columns spans the full width
                all_content.append(Rectangle(content.left, content.bottom, content.right, ybottom))

    areas["content"] = all_content
    scaled_areas = {}
    for name, area in areas.items():
        if isinstance(area, list):
            scaled_areas[name] = [_scale(r) for r in area]
        else:
            scaled_areas[name] = _scale(area)
    return scaled_areas
def
areas_blue_gray(page) -> dict:
def areas_blue_gray(page) -> dict:
    """Compute the named page areas for the blue/gray document template.

    Rectangles are declared in relative page coordinates and multiplied
    with the page width and height before being returned.

    :param page: page object providing ``index``, ``width``, ``height``
                 and ``text_in_area()``.
    :return: dictionary mapping an area name to a scaled ``Rectangle``, or
             to a list of scaled rectangles for the ``"content"`` entry.
    """
    def _scale(r):
        return Rectangle(r.left * page.width, r.bottom * page.height,
                         r.right * page.width, r.top * page.height)

    # This template doesn't use rotated pages, instead uses
    # hardcoded rotated page dimensions
    is_landscape = page.width > page.height
    if is_landscape:
        content = Rectangle(0.05, 0.025, 0.89, 0.975)
        identifier_area = Rectangle(0, 0.6, 0.05, 1)
        chapter_area = Rectangle(0.9025, 0.05, 0.9175, 0.7)
    else:
        content = Rectangle(0.025, 0.05, 0.975, 0.89 if page.index else 0.81)
        identifier_area = Rectangle(0, 0, 0.4, 0.05)
        chapter_area = Rectangle(0.3, 0.9025, 0.95, 0.9175)

    cover_areas = []
    if page.index == 0:
        cover_areas = [
            # Document device string
            Rectangle(0.4, 0.91, 0.95, 0.95),
            # Document description string
            Rectangle(0.05, 0.81, 0.95, 0.86)
        ]

    # By default the whole content area is used as one column
    areas = {
        "id": identifier_area,
        "top": chapter_area,
        "all_content": content,
        "content": [content],
    }
    if page.index < 10:
        # Contains only a table with product summary
        bullet_column = Rectangle(0.35, content.bottom, 0.37, content.top)
        column_text = page.text_in_area(_scale(bullet_column))
        if any(marker in column_text for marker in {"•", chr(61623)}):
            areas["middle_bullets"] = bullet_column
            # Contains the actual content here
            left_column = Rectangle(content.left, content.bottom, 0.3565, content.top)
            right_column = Rectangle(0.3565, content.bottom, content.right, content.top)
            areas["content"] = cover_areas + [left_column, right_column]

    return {
        name: [_scale(r) for r in area] if isinstance(area, list) else _scale(area)
        for name, area in areas.items()
    }
def
spacing_black_white(page) -> dict:
def spacing_black_white(page) -> dict:
    """Return the spacing constants for the black/white document template.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dictionary of horizontal (``x_*``) and vertical (``y_*``)
             distances in page units plus the unitless factors ``lh``,
             ``sc`` and ``th``.
    """
    width, height = page.width, page.height
    if page.rotation:
        # Rotated pages swap the em sizes and use different thresholds
        margin = 0.14
        x_em, y_em = 0.01 * height, 0.01 * width
        table_line = 0.005 * width
        line_height, script = 1.2, 0.4
    else:
        margin = 0.1125
        x_em, y_em = 0.01 * width, 0.01 * height
        table_line = 0.005 * height
        line_height, script = 0.9, 0.325

    return {
        # Horizontal spacing: left->right
        "x_em": x_em,
        "x_left": margin * width,
        "x_right": (1 - margin) * width,
        "x_content": 0.2075 * width,
        # Vertical spacing: bottom->top
        "y_em": y_em,
        # Max table line thickness
        "y_tline": table_line,
        # Max line height distance to detect paragraphs
        "lh": line_height,
        # Max line height distance to detect super-/subscript
        "sc": script,
        # Table header cell bold text threshold
        "th": 0.33,
    }
def
spacing_blue_gray(page) -> dict:
def spacing_blue_gray(page) -> dict:
    """Return the spacing constants for the blue/gray document template.

    :param page: object providing ``width``, ``height`` and ``rotation``.
    :return: dictionary of horizontal (``x_*``) and vertical (``y_*``)
             distances in page units plus the unitless factors ``lh``,
             ``sc`` and ``th``.
    """
    width, height = page.width, page.height
    margin = 0.07
    upright = {
        # Horizontal spacing: left->right
        "x_em": 0.01 * width,
        "x_left": margin * width,
        "x_right": (1 - margin) * width,
        "x_content": 0.165 * width,
        # Vertical spacing: bottom->top
        "y_em": 0.01 * height,
        # Max table line thickness
        "y_tline": 0.005 * height,
        # Max line height distance to detect paragraphs
        "lh": 0.9,
        # Max line height distance to detect super-/subscript
        "sc": 0.3,
        # Table header cell bold text threshold
        "th": 0.33,
    }
    if not page.rotation:
        return upright

    # Values that differ on rotated pages, "th" is kept from above
    rotated = {
        "x_em": 0.01 * height,
        "y_em": 0.01 * width,
        "x_left": 0.05 * width,
        "x_right": (1 - 0.16) * width,
        "x_content": 0.2075 * width,
        "y_tline": 0.005 * width,
        "lh": 1.6,
        "sc": 0.2,
    }
    return {**upright, **rotated}
def
linesize_black_white(line: float) -> str:
def
linesize_blue_gray(line: float) -> str:
def
colors_black_white(color: int) -> str:
def
colors_blue_gray(color: int) -> str:
def colors_blue_gray(color: int) -> str:
    """Translate a color value of the blue/gray template into a color name.

    :param color: integer color value as compared against the table below.
    :return: the matching color name, or ``"unknown"`` if the value is not
             one of the known colors.
    """
    known_colors = {
        0xff: "black",
        0xffffffff: "white",
        0xb9c4caff: "gray",
        0x1f81afff: "lightblue",
        0x2052ff: "darkblue",
        0x39a9dcff: "blue",
    }
    return known_colors.get(color, "unknown")
class Page(PdfPage):
    """A PDF page of a STMicro document with content extraction helpers.

    The page selects one of two layout templates (``black_white`` or
    ``blue_gray``) based on the ``Producer`` entry of the PDF metadata and
    uses the matching area, spacing, color and line size functions to find
    text lines, tables and figures and to build a content tree from them.
    """

    def __init__(self, document, index: int):
        """
        :param document: the document this page belongs to.
        :param index: the zero-based index of the page in the document.
        """
        super().__init__(document, index)
        self._template = "black_white"
        producer = self.pdf.metadata.get("Producer", "").lower()
        if "acrobat" in producer:
            pass # default
        elif "antenna" in producer:
            self._template = "blue_gray"
        else:
            LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")

        if "blue_gray" in self._template:
            self._areas = areas_blue_gray(self)
            self._spacing = spacing_blue_gray(self)
            self._colors = colors_blue_gray
            self._line_size = linesize_blue_gray
        elif "black_white" in self._template:
            self._areas = areas_black_white(self)
            self._spacing = spacing_black_white(self)
            self._colors = colors_black_white
            self._line_size = linesize_black_white

        # Patches to detect the header cells correctly
        if ((self.pdf.name == "DS12930-v1" and self.index in range(90, 106)) or
            (self.pdf.name == "DS12931-v1" and self.index in range(89, 105))):
            self._spacing["th"] = 0.1
        if ((self.pdf.name == "RM0453-v2" and self.index in [1354]) or
            (self.pdf.name == "RM0456-v2" and self.index in [2881]) or
            (self.pdf.name == "RM0456-v3" and self.index in [2880]) or
            (self.pdf.name == "RM0461-v4" and self.index in [1246])):
            self._spacing["th"] = 0.5
        if ((self.pdf.name == "RM0456-v2" and self.index in [3005])):
            self._spacing["th"] = 0.52

    def _text_in_area(self, name, check_length=True) -> str:
        """Return the concatenated text of the named area(s).

        :param name: key of the area in the template area dictionary.
        :param check_length: assert that the resulting text is not empty.
        :return: the text, or an empty string if the area name is unknown.
        """
        if name not in self._areas: return ""
        areas = self._areas[name]
        if not isinstance(areas, list): areas = [areas]
        text = "".join(self.text_in_area(area) for area in areas)
        if check_length: assert text
        return text

    @cached_property
    def identifier(self) -> str:
        """The text found in the ``id`` area of the page (may be empty)."""
        return self._text_in_area("id", check_length=False)

    @cached_property
    def top(self) -> str:
        """The text in the ``top`` area, or ``"Cover"`` for the first page."""
        if self.index == 0:
            return "Cover"
        return self._text_in_area("top", check_length=False)

    def is_relevant(self) -> bool:
        """Return False if the top text marks a contents, list or index page."""
        return not any(c in self.top for c in {"Contents", "List of ", "Index"})

    def _charlines_filtered(self, area, predicate = None, rtol = None) -> list[CharLine]:
        """Group the characters inside an area into sorted lines of text.

        :param area: the area to search for characters in.
        :param predicate: optional function receiving a character, all
                          characters for which it returns a falsy value
                          are ignored.
        :param rtol: factor of the line height used to decide whether two
                     neighboring lines overlap and must be merged. Defaults
                     to the ``sc`` spacing value of the template.
        :return: list of character lines, empty if no line was found.
        """
        if rtol is None: rtol = self._spacing["sc"]
        # Split all chars into lines based on rounded origin
        origin_lines_y = defaultdict(list)
        origin_lines_x = defaultdict(list)
        for char in self.chars_in_area(area):
            # Ignore all characters we don't want
            if predicate is not None and not predicate(char):
                continue
            # Ignore Carriage Return characters and ® (superscript issues)
            if char.unicode in {0xd, ord("®")}:
                continue
            # Correct some weird unicode stuffing choices
            if char.unicode in {2}:
                char.unicode = ord("-")
            if char.unicode in {61623, 61664}:
                char.unicode = ord("•")
            if char.unicode < 32 and char.unicode not in {0xa}:
                continue
            # Ignore characters without width that are not spaces
            if not char.width and char.unicode not in {0xa, 0xd, 0x20}:
                LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
            # Split up the chars depending on the orientation
            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
                origin_lines_x[round(char.origin.x, 1)].append(char)
            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
                origin_lines_y[round(char.origin.y, 1)].append(char)
            else:
                # The message must be fully formatted here: passing extra
                # positional arguments without placeholders makes the
                # logging module fail while formatting the record.
                LOGGER.error(f"Unknown char rotation: {char} {char.rotation}")

        # Convert characters into lines
        bbox_lines_y = []
        for chars in origin_lines_y.values():
            # Remove lines with whitespace only
            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
                continue
            origin = statistics.fmean(c.origin.y for c in chars)
            line = CharLine(self, chars,
                            min(c.bbox.bottom for c in chars),
                            origin,
                            max(c.bbox.top for c in chars),
                            max(c.height for c in chars),
                            sort_origin=self.height - origin)
            bbox_lines_y.append(line)
        bbox_lines = sorted(bbox_lines_y, key=lambda l: l._sort_origin)

        bbox_lines_x = []
        for chars in origin_lines_x.values():
            # Remove lines with whitespace only
            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
                continue
            line = CharLine(self, chars,
                            min(c.bbox.left for c in chars),
                            statistics.fmean(c.origin.x for c in chars),
                            max(c.bbox.right for c in chars),
                            max(c.width for c in chars),
                            270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90)
            bbox_lines_x.append(line)
        bbox_lines += sorted(bbox_lines_x, key=lambda l: l._sort_origin)

        if not bbox_lines:
            return []

        # Merge lines that have overlapping bbox_lines
        # FIXME: This merges lines that "collide" vertically like in formulas
        merged_lines = []
        current_line = bbox_lines[0]
        for next_line in bbox_lines[1:]:
            height = max(current_line.height, next_line.height)
            # Calculate overlap via normalize origin (increasing with line index)
            if ((current_line._sort_origin + rtol * height) >
                (next_line._sort_origin - rtol * height)):
                # The next line overlaps this one, we merge the smaller line
                # (typically super- and subscript) into the taller line
                use_current = current_line.height >= next_line.height
                line = current_line if use_current else next_line
                current_line = CharLine(self, current_line.chars + next_line.chars,
                                        line.bottom, line.origin, line.top,
                                        height, line.rotation,
                                        sort_origin=line._sort_origin)
            else:
                # The next line does not overlap the current line
                merged_lines.append(current_line)
                current_line = next_line
        # append last line
        merged_lines.append(current_line)

        # Sort all lines horizontally based on character origin
        sorted_lines = []
        for line in merged_lines:
            if line.rotation == 90:
                def sort_key(char):
                    if char.unicode in {0xa, 0xd}:
                        return char.tbbox.midpoint.y - 1e9
                    return char.tbbox.midpoint.y
            elif line.rotation == 270:
                def sort_key(char):
                    if char.unicode in {0xa, 0xd}:
                        return -char.tbbox.midpoint.y + 1e9
                    return -char.tbbox.midpoint.y
            else:
                def sort_key(char):
                    if char.unicode in {0xa, 0xd}:
                        return char.origin.x + 1e9
                    return char.origin.x
            sorted_lines.append(CharLine(self, sorted(line.chars, key=sort_key),
                                         line.bottom, line.origin,
                                         line.top, line.height,
                                         line.rotation, area.left,
                                         sort_origin=line._sort_origin))

        return sorted_lines

    def _content_areas(self, area: Rectangle, with_graphics: bool = True) -> list:
        """Split an area into text areas and areas occupied by graphics.

        :param area: the area to split up.
        :param with_graphics: if False, the area is returned unsplit.
        :return: list of ``(rectangle, object)`` tuples from top to bottom,
                 where the object is ``None`` for areas without a graphic.
        """
        if with_graphics:
            graphics = self._graphics_filtered(area)
            regions = []
            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
                for reg in regions:
                    if reg.overlaps(gbbox.bottom, gbbox.top):
                        # They overlap, so merge them
                        reg.v0 = min(reg.v0, gbbox.bottom)
                        reg.v1 = max(reg.v1, gbbox.top)
                        reg.objs.append(graphic)
                        break
                else:
                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))

            areas = []
            ypos = area.top
            for reg in regions:
                # Only add the text area above the region if it is tall enough
                if ypos - reg.v1 > self._spacing["y_em"]:
                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
                for obj in reg.objs:
                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
                    areas.append((oarea, obj))
                ypos = reg.v0
            # The remaining text area below the last region
            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
        else:
            areas = [(area, None)]
        return areas

    def _objects_filtered(self, area: Rectangle, with_graphics: bool = True) -> list:
        """Return all character lines and graphic objects inside an area.

        :param area: the area to search in.
        :param with_graphics: passed on to ``_content_areas()``.
        :return: list of character lines, tables and figures.
        """
        self._link_characters()
        areas = self._content_areas(area, with_graphics)
        objects = []
        for narea, obj in areas:
            if obj is None:
                objects += self._charlines_filtered(narea)
            else:
                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
                # Only keep the characters outside of the graphic itself
                predicate = lambda c: not obj.bbox.contains(c.origin)
                lines = self._charlines_filtered(oarea, predicate)
                objects += sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x))
        return objects

    @property
    def content_ast(self) -> list:
        """The content trees of all content areas of this page.

        A ``page`` node carrying the page number is attached to the first
        leaf of the first tree.
        """
        ast = []
        with_graphics = True
        if "DS" in self.pdf.name:
            # FIXME: Terrible hack to get the ordering information table fixed
            # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
            order_page = next((item.page_index for item in self.pdf.toc if item.level == 0 and
                re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)), -1)
            with_graphics = (order_page != self.index)
        for area in self._areas["content"]:
            ast.append(self._ast_filtered(area, with_graphics=with_graphics))
        # Add a page node to the first leaf to keep track of where a page starts
        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
        return ast

    def _graphics_filtered(self, area) -> list:
        """Find the tables and figures inside an area.

        Graphic clusters are associated with bold ``Figure N.`` and
        ``Table N.`` captions where possible, the remaining clusters are
        categorized by their paths.

        :param area: the area to search in.
        :return: list of ``Table`` and ``Figure`` objects.
        """
        # Find all graphic clusters in this area
        em = self._spacing["y_em"]
        large_area = area.offset_x(em/2)
        graphic_clusters = self.graphic_clusters(lambda p: large_area.contains(p.bbox), em/2)

        # Find the captions and group them by y origin to catch side-by-side figures
        ycaptions = defaultdict(list)
        for line in self._charlines_filtered(area, lambda c: "Bold" in c.font):
            for cluster in line.clusters():
                for phrase in [r"Figure \d+\.", r"Table \d+\."]:
                    if re.match(phrase, cluster.content):
                        ycaptions[int(round(cluster.bbox.y / em))].append((phrase, cluster.chars))
        ycaptions = [ycaptions[k] for k in sorted(ycaptions.keys(), key=lambda y: -y)]

        # Now associate these captions with the graphics bboxes
        categories = []
        for captions in ycaptions:
            # Side-by-side captions share the area width equally
            width = area.width / len(captions)
            for ii, (phrase, chars) in enumerate(sorted(captions, key=lambda c: c[1][0].origin.x)):
                left, right = area.left + ii * width, area.left + (ii + 1) * width
                bottom, top, height = chars[0].bbox.bottom, chars[0].bbox.top, chars[0].height

                # Find the graphic associated with this caption
                graphic = next(((b, p) for b, p in graphic_clusters
                                if b.bottom <= bottom and
                                   left <= b.left and b.right <= right), None)
                if graphic is None:
                    LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
                    continue

                if self._template == "blue_gray":
                    # Search for all lines of the current caption with the same properties
                    cbbox = Rectangle(left, bottom, right, top)
                    cchars = self.chars_in_area(cbbox)
                    while True:
                        nbbox = Rectangle(left, max(graphic[0].top, cbbox.bottom - height), right, top)
                        nchars = self.chars_in_area(nbbox)
                        if len(cchars) >= len(nchars):
                            break
                        cbbox = nbbox
                        cchars = nchars
                elif self._template == "black_white":
                    cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)

                otype = phrase.split(" ")[0].lower()
                if "Figure" in phrase:
                    # Find all other graphics in the bounding box
                    gbbox = Rectangle(left, graphic[0].bottom, right, cbbox.bottom)
                    graphics = []
                    for b, p in graphic_clusters:
                        if gbbox.overlaps(b):
                            graphics.append((b,p))
                    for g in graphics:
                        graphic_clusters.remove(g)
                    gbbox = [cluster[0] for cluster in graphics]
                    gbbox = reduce(lambda r0, r1: r0.joined(r1), gbbox)
                    paths = [p for cluster in graphics for p in cluster[1]]

                    if self._template == "blue_gray":
                        # Search for characters below the graphics bbox, max 1 y_em
                        gbbox = Rectangle(left, gbbox.bottom, right, gbbox.bottom)
                        while True:
                            gbbox = Rectangle(left, gbbox.bottom - self._spacing["y_em"], right, gbbox.bottom)
                            if not self.chars_in_area(gbbox):
                                break
                        # Generate the new bounding box which includes the caption
                        gbbox = Rectangle(left, gbbox.bottom, right, cbbox.bottom)
                elif "Table" in phrase:
                    graphic_clusters.remove(graphic)
                    gbbox, paths = graphic
                    if (self._template == "black_white" and
                        sum(1 for path in paths if path.count == 2) >= len(paths) / 2):
                        otype += "_lines"
                categories.append((otype, cbbox, gbbox, paths))

        # Deal with the remaining graphic categories
        for gbbox, paths in graphic_clusters:
            if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
                continue
            if any(isinstance(p, Image) for p in paths):
                category = "figure"
            elif self._template == "blue_gray":
                if all(self._colors(path.stroke) == "gray" or
                       self._colors(path.fill) == "darkblue" for path in paths):
                    category = "table"
                else:
                    category = "figure"
            elif self._template == "black_white":
                # Some tables are rendered explicitly with filled rectangular
                # shapes with others are implicitly rendered with stroked lines
                stroked_table_lines = sum(1 for path in paths if path.count == 2) >= len(paths) / 2
                is_table = stroked_table_lines or all(
                    [any(p.isclose(pp) for pp in path.bbox.points)
                     for p in path.points].count(True) >= len(path.points) * 2 / 3
                    for path in paths)
                if (len(paths) > 1 and is_table):
                    category = "table"
                    if stroked_table_lines:
                        category += "_lines"
                else:
                    category = "figure"

            if "table" in category:
                # Check if there are only numbers on top of the table
                cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, gbbox.top + self._spacing["y_em"])
                nchars = [c for c in self.chars_in_area(cbbox) if c.unicode not in {0x20, 0xa, 0xd}]

                if nchars and sum(1 if c.char.isnumeric() else 0 for c in nchars) >= len(nchars) / 3:
                    # This is a register table with invisible top borders!
                    cbbox = Rectangle(gbbox.left, gbbox.top, gbbox.right, max(c.bbox.top for c in nchars))
                    gbbox = Rectangle(gbbox.left, gbbox.bottom, gbbox.right, cbbox.top)
                    name = "register_" + category
                else:
                    cbbox = None
                    name = category
                categories.append((name, cbbox, gbbox, paths))
            else:
                categories.append(("figure", None, gbbox, paths))

        # Convert the objects into specialized classes
        categories.sort(key=lambda o: (-o[2].y, o[2].x))
        objects = []
        for otype, caption_bbox, graphics_bbox, graphics_paths in categories:
            if "figure" in otype:
                figure = Figure(self, graphics_bbox, caption_bbox, graphics_paths)
                objects.append(figure)
            elif "table" in otype:
                xlines, ylines, yhlines = [], [], []
                for path in graphics_paths:
                    if self._template == "blue_gray" or "_lines" in otype:
                        if self._colors(path.stroke) == "gray" or "_lines" in otype:
                            # Intercell paths in gray
                            if len(path.lines) == 1:
                                line = path.lines[0]
                                if line.direction == line.Direction.VERTICAL:
                                    xlines.append(line.specialize())
                                elif line.direction == line.Direction.HORIZONTAL:
                                    ylines.append(line.specialize())
                                else:
                                    LOGGER.warning(f"Line not vertical or horizontal: {line}")
                            else:
                                LOGGER.warning(f"Path too long: {path}")
                        elif self._colors(path.fill) == "darkblue":
                            # Add the bottom line of the dark blue header box as a very thick line
                            line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
                            yhlines.append(line)

                    elif self._template == "black_white":
                        bbox = path.bbox
                        is_vertical = bbox.width < bbox.height
                        width = bbox.width if is_vertical else bbox.height
                        length = bbox.height if is_vertical else bbox.width
                        if width <= self._spacing["x_em"] / 2:
                            if length >= self._spacing["y_em"] / 2:
                                if is_vertical:
                                    line = VLine(bbox.midpoint.x, bbox.bottom, bbox.top, bbox.width)
                                    xlines.append(line)
                                else:
                                    line = HLine(bbox.midpoint.y, bbox.left, bbox.right, bbox.height)
                                    ylines.append(line)
                        else:
                            # Split the rectangle into its outline
                            xlines.append(VLine(bbox.left, bbox.bottom, bbox.top, 0.1))
                            xlines.append(VLine(bbox.right, bbox.bottom, bbox.top, 0.1))
                            ylines.append(HLine(bbox.bottom, bbox.left, bbox.right, 0.1))
                            ylines.append(HLine(bbox.top, bbox.left, bbox.right, 0.1))
                if yhlines:
                    # Only the lowest header line is used
                    yhlines.sort(key=lambda l: l.p0.y)
                    ylines.append(yhlines[0])
                if not xlines or not ylines:
                    continue
                table = Table(self, graphics_bbox, xlines, ylines, caption_bbox,
                              is_register="register" in otype)
                objects.append(table)

        return objects

    @property
    def content_objects(self) -> list:
        """All character lines, tables and figures of all content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self._objects_filtered(area))
        return objs

    @property
    def content_graphics(self) -> list:
        """All tables and figures of all content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self._graphics_filtered(area))
        return objs

    @property
    def content_lines(self) -> list:
        """All character lines of all content areas."""
        return [o for o in self.content_objects if isinstance(o, CharLine)]

    @property
    def content_tables(self) -> list:
        """All tables of all content areas."""
        return [o for o in self.content_graphics if isinstance(o, Table)]

    @property
    def content_figures(self) -> list:
        """All figures of all content areas."""
        return [o for o in self.content_graphics if isinstance(o, Figure)]

    def _char_properties(self, line, char):
        """Return the formatting properties of a character within its line.

        :param line: the character line the character is part of.
        :param char: the character to describe.
        :return: dictionary with the keys ``superscript``, ``subscript``,
                 ``bold``, ``italic``, ``underline``, ``size``, ``relsize``
                 and ``char``.
        """
        cp = {
            "superscript": False,
            "subscript": False,
            "bold": any(frag in char.font for frag in {"Bold"}),
            "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
            "underline": (char.objlink or char.weblink) is not None,
            "size": round(line.height),
            "relsize": self._line_size(line),
            "char": chr(char.unicode),
        }

        # Rotated lines compare the x origin, all other lines the y origin
        if line.rotation:
            if char.origin.x < (line.origin - 0.25 * line.height):
                cp["superscript"] = True
            elif char.origin.x > (line.origin + 0.15 * line.height):
                cp["subscript"] = True
        elif char.origin.y > (line.origin + 0.25 * line.height):
            cp["superscript"] = True
        elif char.origin.y < (line.origin - 0.15 * line.height):
            cp["subscript"] = True

        return cp

    def _ast_filtered(self, area: Rectangle, with_graphics=True,
                      ignore_xpos=False, with_bits=True, with_notes=True) -> Node:
        """Build the content tree of an area from its lines and graphics.

        Lines are classified as headings, notes, captions, list items,
        register bit descriptions or paragraphs and are nested depending
        on their horizontal position and the distance to the previous line.

        :param area: the area to convert.
        :param with_graphics: also detect tables, figures and captions.
        :param ignore_xpos: do not unindent based on the horizontal position.
        :param with_bits: detect register bit descriptions.
        :param with_notes: detect notes, cautions and warnings.
        :return: the ``area`` root node of the tree.
        """
        x_em = self._spacing["x_em"]
        spacing_content = self._spacing["x_content"]
        lh_factor = self._spacing["lh"]
        root = Node("area", obj=area, xpos=int(area.left), page=self)

        def unindent(_xpos, _current, _newlines=1):
            current = _current
            # Check if we need to unindent the current node
            while (_xpos - current.xpos) < -x_em and current.parent is not None and not ignore_xpos:
                current = current.parent
            if _newlines >= 2 and current.name == "para":
                current = current.parent
            return current

        def parent_name(current):
            return "" if current.parent is None else current.parent.name

        current = root
        ypos = area.top
        for obj in self._objects_filtered(area, with_graphics):
            xpos = round(obj.bbox.left)
            # Tables should remain in their current hierarchy regardless of indentation
            if isinstance(obj, (Table, Figure)):
                current = next((c for c in current.iter_path_reverse()
                                if c.name.startswith("head")), root)
                name = "figure" if isinstance(obj, Figure) else "table"
                Node(name, parent=current, obj=obj, xpos=xpos, number=-1,
                     _width=obj.bbox.width / area.width, _type=obj._type)
                ypos = obj.bbox.bottom
            # Lines of text need to be carefully checked for indentation
            elif isinstance(obj, CharLine):
                newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
                content = obj.content
                lcontent = content.lstrip()
                content_start = 0
                linesize = self._line_size(obj)

                # Check when the note has finished (=> paragraphs without italic)
                if (parent_name(current) == "note" and
                    ((current.parent.type == "note" and not obj.contains_font(current.parent._font)) or
                     (current.parent.type in {"caution", "warning"} and newlines >= 2))):
                    current = current.parent.parent

                # Check when the list ends into something indented far too right
                elif (parent_name(current).startswith("list")
                      and (xpos - current.xpos) >= 2 * x_em):
                    current = current.parent.parent

                # Check if line is a heading, which may be multi-line, so we must
                # be careful not to nest them, but group them properly
                # Headings are always inserted into the root note!
                if linesize.startswith("h1") or (linesize.startswith("h") and
                        xpos < (spacing_content + 2 * x_em) and "Bold" in obj.chars[0].font):
                    if (match := re.match(r"^ *(\d+(\.\d+)?(\.\d+)?) *", content)) is not None:
                        start = min(len(match.group(0)), len(obj.chars) - 1)
                        marker = match.group(1)
                        size = marker.count('.') + 2
                    else:
                        start = 0
                        marker = None
                        size = linesize[1]
                    name = f"head{size}"
                    # Check if we're already parsing a heading, do not split into two
                    if parent_name(current) != name or newlines > 2:
                        content_start = start
                        xpos = round(obj.chars[content_start].bbox.left)
                        current = Node(name, parent=root, obj=obj, xpos=xpos,
                                       size=size, marker=marker)
                        current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if the line is a note and deal with the indentation correctly
                elif with_notes and (match := re.match(r" *([Nn]ote|[Cc]aution|[Ww]arning):? \d?", content)) is not None:
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    # Correct xposition only if the Note: string is very far left
                    if xpos + 4 * x_em <= current.xpos:
                        xpos = round(obj.chars[content_start].bbox.left)
                    # Prevent nesting of notes, they should only be listed
                    if parent_name(current) == "note":
                        current = current.parent.parent
                    current = unindent(xpos, current, 2)
                    current = Node("note", parent=current, obj=obj, xpos=xpos,
                                   type=match.group(1).lower(), _font=obj.chars[content_start].font)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is Table or Figure caption
                elif with_graphics and ((match := re.match(r" *([Tt]able|[Ff]igure) ?(\d+)\.? ?", content)) is not None
                      and "Bold" in obj.chars[0].font):
                    content_start = min(len(match.group(0)), len(obj.chars) - 1)
                    current = next((c for c in current.iter_path_reverse()
                                    if c.name.startswith("head")), root)
                    current = Node("caption", parent=current, obj=obj, xpos=xpos,
                                   _type=match.group(1).lower(), number=int(match.group(2)))
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is list and group them according to indentation
                elif (match := re.match(r"^ *([•–]) ..|^ *(\d+)\. ..|^ *([a-z])\) ?..", content)) is not None:
                    current = unindent(xpos, current, newlines)
                    content_start = len(match.group(0)) - 2
                    xpos = round(obj.chars[content_start].bbox.left)
                    name = "listb"
                    value = lcontent[0]
                    if value in {"–", "-"}: name = "lists"
                    elif value.isalpha(): name = "lista"
                    elif value.isnumeric():
                        name = "listn"
                        value = int(match.group(2))
                    current = Node(name, parent=current, obj=obj, xpos=xpos, value=value)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if line is a register bit definition
                elif with_bits and re.match(r" *([Bb]ytes? *.+? *)?B[uio]ts? *\d+", content) is not None:
                    if obj.contains_font("Bold"):
                        # Use the bold character as delimiter
                        content_start = next(xi for xi, c in enumerate(obj.chars) if "Bold" in c.font)
                    else:
                        # Default back to the regex
                        if "Reserved" not in content:
                            LOGGER.warning(f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}")
                        content_start = re.match(r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content)
                        if content_start is None:
                            LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
                            content_start = 0
                        else:
                            content_start = len(content_start.group(0))
                    if not content_start:
                        LOGGER.error(f"Missing content start (=0)! '{content}'!")
                    content_start = min(content_start, len(obj.chars) - 1)

                    current = next((c for c in current.iter_path_reverse()
                                    if c.name.startswith("head")), root)
                    middle = obj.chars[content_start].bbox.left
                    xpos = round(middle)
                    current = Node("bit", parent=current, obj=obj, xpos=xpos, _page=self,
                                   _middle=middle, _left=area.left, _right=area.right)
                    current = Node("para", parent=current, obj=obj, xpos=current.xpos)

                # Check if this is a new paragraph
                elif newlines >= 2 or current.name not in {"para"}:
                    # Fix issues where notes are reflowing back left of Note: text
                    if parent_name(current) in {"note"}:
                        if xpos < current.parent.xpos:
                            xpos = current.parent.xpos
                    # Prevent multiline
                    current = unindent(xpos, current, newlines)
                    current = Node("para", parent=current, obj=obj,
                                   xpos=xpos if current.is_root else current.xpos)

                elif (parent_name(current) not in {"caption", "bit", "area"}):
                    current = unindent(xpos, current, newlines)

                # Add the actual line
                Node("line", parent=current, obj=obj, xpos=xpos,
                     start=content_start, str=content[content_start:50])

                ypos = obj.origin

        return root

    def __repr__(self) -> str:
        return f"StPage({self.number})"
This class provides low-level access to the graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.
Page(document, index: int)
def __init__(self, document, index: int):
    """
    Set up the page and pick the layout template matching the PDF producer.

    The template defaults to "black_white". A "Producer" metadata entry
    containing "acrobat" keeps that default, one containing "antenna"
    switches to "blue_gray", and anything else is logged as an error while
    keeping the default. The areas, spacing, colors and line size helpers
    of the chosen template are then stored on the page, and finally a few
    document-specific table header spacings are patched.

    :param document: a PDF document.
    :param index: 0-index page number.
    """
    super().__init__(document, index)

    # Detect the template from the tool that produced the PDF
    self._template = "black_white"
    producer = self.pdf.metadata.get("Producer", "").lower()
    if "acrobat" not in producer:
        if "antenna" in producer:
            self._template = "blue_gray"
        else:
            LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")

    # Resolve the helpers of the template first, then assign them in one place
    if "blue_gray" in self._template:
        make_areas, make_spacing = areas_blue_gray, spacing_blue_gray
        colors, line_size = colors_blue_gray, linesize_blue_gray
    elif "black_white" in self._template:
        make_areas, make_spacing = areas_black_white, spacing_black_white
        colors, line_size = colors_black_white, linesize_black_white
    else:
        make_areas = None
    if make_areas is not None:
        self._areas = make_areas(self)
        self._spacing = make_spacing(self)
        self._colors = colors
        self._line_size = line_size

    # Patches to detect the header cells correctly:
    # (header spacing, document name, affected page indexes)
    header_patches = (
        (0.1, "DS12930-v1", range(90, 106)),
        (0.1, "DS12931-v1", range(89, 105)),
        (0.5, "RM0453-v2", [1354]),
        (0.5, "RM0456-v2", [2881]),
        (0.5, "RM0456-v3", [2880]),
        (0.5, "RM0461-v4", [1246]),
        (0.52, "RM0456-v2", [3005]),
    )
    for header_spacing, pdf_name, page_indexes in header_patches:
        if self.pdf.name == pdf_name and self.index in page_indexes:
            self._spacing["th"] = header_spacing
Parameters
- document: a PDF document.
- index: 0-indexed page number.
content_ast: list
@property
def content_ast(self) -> list:
    """
    Build one AST per content area of this page.

    Every area in `self._areas["content"]` is converted with
    `self._ast_filtered()`. For documents whose name contains "DS",
    graphics are disabled on the page that the first matching top-level
    table-of-contents entry ("ordering information" or "part numbering")
    points to.

    A "page" node carrying the page number is attached to the first leaf
    found among the descendants of the first tree, or to the root of that
    tree if no leaf is found.

    :return: the list of AST root nodes, in the order of the content areas.
    """
    include_graphics = True
    if "DS" in self.pdf.name:
        # FIXME: Terrible hack to get the ordering information table fixed
        # Should be done in the AST as a rewrite similar to bit table rewrite with VirtualTable
        ordering_page = -1
        for entry in self.pdf.toc:
            if entry.level == 0 and re.search("ordering +information|part +numbering",
                                              entry.title, re.IGNORECASE):
                ordering_page = entry.page_index
                break
        include_graphics = ordering_page != self.index

    trees = [self._ast_filtered(area, with_graphics=include_graphics)
             for area in self._areas["content"]]

    # Add a page node to the first leaf to keep track of where a page starts
    anchor = trees[0]
    for candidate in trees[0].descendants:
        if candidate.is_leaf:
            anchor = candidate
            break
    Node("page", parent=anchor, xpos=anchor.xpos, number=self.number)
    return trees
Inherited Members
- modm_data.pdf.page.Page
- index
- number
- label
- width
- height
- rotation
- bbox
- char_count
- char
- chars
- objlinks
- weblinks
- chars_in_area
- text_in_area
- structures
- find
- paths
- images
- graphic_clusters
- pypdfium2._helpers.page.PdfPage
- parent
- get_width
- get_height
- get_size
- get_rotation
- set_rotation
- get_mediabox
- set_mediabox
- get_cropbox
- set_cropbox
- get_bleedbox
- set_bleedbox
- get_trimbox
- set_trimbox
- get_artbox
- set_artbox
- get_bbox
- get_textpage
- insert_obj
- remove_obj
- gen_content
- get_objects
- render
- pypdfium2.internal.bases.AutoCloseable
- close