modm_data.pdf2html.page
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""Page-level grouping of PDF characters and graphics for the HTML conversion."""

import logging
import statistics
from typing import Callable
from functools import cached_property
from collections import defaultdict
from .table import Table
from .figure import Figure
from .line import CharLine
from ..utils import Rectangle, Region
from ..pdf import Page as PdfPage, Character
from anytree import Node


_LOGGER = logging.getLogger(__name__)


class Page(PdfPage):
    """
    A PDF page whose characters and graphics are grouped for HTML conversion.

    The spacing, font size, color and area definitions as well as
    `graphics_in_area` and `ast_in_area` only provide defaults here.
    NOTE(review): presumably document-specific subclasses override them; confirm.
    """

    def __init__(self, document, index: int):
        """
        :param document: a PDF document.
        :param index: 0-indexed page number.
        """
        super().__init__(document, index)
        self._template = "default"
        self.is_relevant: bool = True
        """Is this page relevant for the conversion?"""

    def _unicode_filter(self, code: int) -> int | None:
        """
        Map the unicode code point of a character before it is added to a line.

        `charlines_in_area` stores the returned value back into the character
        and drops the character if `None` is returned. The default keeps the
        code point unchanged.

        :param code: the unicode code point of the character.
        :return: the code point to use or `None` to discard the character.
        """
        return code

    @cached_property
    def _spacing(self) -> dict[str, float]:
        """
        Spacing thresholds used to group the page content.

        The `x_*` and `y_*` values are distances derived from the page width
        and height, the remaining values are unitless factors.
        """
        content = 0.1
        return {
            # Horizontal spacing: left->right
            "x_em": 0.01 * self.width,
            "x_left": content * self.width,
            "x_right": (1 - content) * self.width,
            "x_content": 0.2 * self.width,
            # Vertical spacing: bottom->top
            "y_em": 0.01 * self.height,
            # Max table line thickness
            "y_tline": 0.005 * self.height,
            # Max line height distance to detect paragraphs
            "lh": 0.9,
            # Max line height distance to detect super-/subscript
            "sc": 0.3,
            # Table header cell bold text threshold
            "th": 0.3,
        }

    def _line_size(self, line: CharLine) -> str:
        """
        Classify a line by its height.

        :param line: the character line to classify.
        :return: one of `h1`, `h2`, `h3`, `h4`, `n` or `fn`.
            NOTE(review): presumably heading levels, normal text and footnote; confirm.
        """
        rsize = line.height
        if rsize >= 17.5:
            return "h1"
        elif rsize >= 15.5:
            return "h2"
        elif rsize >= 13.5:
            return "h3"
        elif rsize >= 11.4:
            return "h4"
        elif rsize >= 8.5:
            return "n"
        else:
            return "fn"

    def _colors(self, color: int) -> str:
        """
        Name a color value.

        :param color: the color as an integer.
        :return: `black` for `0xFF`, `white` for `0xFFFFFFFF`, otherwise `unknown`.
        """
        # Each range currently contains exactly one value
        if 0xFF <= color <= 0xFF:
            return "black"
        if 0xFFFFFFFF <= color <= 0xFFFFFFFF:
            return "white"
        return "unknown"

    @cached_property
    def _areas(self) -> dict[str, list[Rectangle] | Rectangle]:
        """
        The named areas of the page scaled to the page size.

        Areas are declared relative to the page (0 to 1) and multiplied with
        the page width and height. Only a `content` area is defined here.
        """
        content = Rectangle(0.1, 0.1, 0.9, 0.9)
        areas = {"content": [content]}
        scaled_areas = {}

        def _s(r):
            return Rectangle(r.left * self.width, r.bottom * self.height, r.right * self.width, r.top * self.height)

        for name, area in areas.items():
            scaled_areas[name] = [_s(r) for r in area] if isinstance(area, list) else _s(area)
        return scaled_areas

    def _char_properties(self, line, char):
        """
        Collect the formatting properties of a character within its line.

        Bold and italic are derived from the font name, underline from the
        presence of a link, size from the line height. Super- and subscript are
        detected by comparing the character origin with the line origin: along
        x for lines with a rotation, along y otherwise.

        :param line: the line containing the character.
        :param char: the character to describe.
        :return: a dictionary of the character properties.
        """
        cp = {
            "superscript": False,
            "subscript": False,
            "bold": any(frag in char.font for frag in {"Bold"}),
            "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
            "underline": (char.objlink or char.weblink) is not None,
            "size": round(line.height),
            "relsize": self._line_size(line),
            "char": chr(char.unicode),
        }
        if line.rotation:
            if char.origin.x < (line.origin - 0.25 * line.height):
                cp["superscript"] = True
            elif char.origin.x > (line.origin + 0.15 * line.height):
                cp["subscript"] = True
        elif char.origin.y > (line.origin + 0.25 * line.height):
            cp["superscript"] = True
        elif char.origin.y < (line.origin - 0.15 * line.height):
            cp["subscript"] = True
        return cp

    def text_in_named_area(self, name: str, check_length: bool = True) -> str | None:
        """
        Find all text in the named area.

        :param name: the name of the area(s) to query.
        :param check_length: assert that the text has a length.
        :return: the concatenated text of the named area(s) or `None` if area not found.
        """
        if name not in self._areas:
            return None
        text = ""
        areas = self._areas[name]
        if not isinstance(areas, list):
            areas = [areas]
        for area in areas:
            text += self.text_in_area(area)
        if check_length:
            assert text
        return text

    def charlines_in_area(
        self, area: Rectangle, predicate: Callable[[Character], bool] | None = None, rtol: float | None = None
    ) -> list[CharLine]:
        """
        Coalesce the characters in the area and predicate into lines.

        1. Every character in the area is filtered by the `predicate`.
        2. Character orientation is split into horizontal (left->right) and
           vertical (bottom->top) character lines sorted by x or y position.
           Lines containing only whitespace are discarded.
        3. Overlapping character lines are merged into sub- and superscript
           using `rtol * max(current_line.height, next_line.height)` as the
           tolerance for checking if the lines overlap.
        4. The characters in the merged lines are re-sorted by origin.

        :param area: Area to search for characters.
        :param predicate: Function to discard characters in the area or include all by default.
        :param rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default.
        :return: A list of character lines sorted by x or y position.
        """
        if rtol is None:
            rtol = self._spacing["sc"]
        whitespace = {0xA, 0xD, 0x20}
        # Split all chars into lines based on rounded origin
        origin_lines_y = defaultdict(list)
        origin_lines_x = defaultdict(list)
        for char in self.chars_in_area(area):
            # Ignore all characters we don't want
            if predicate is not None and not predicate(char):
                continue
            cunicode = self._unicode_filter(char.unicode)
            if cunicode is None:
                continue
            # The filtered code point is written back into the character
            char.unicode = cunicode
            # Drop control characters except for the line feed
            if char.unicode < 32 and char.unicode not in {0xA}:
                continue
            # Characters without width that are not spaces are reported but kept
            if not char.width and char.unicode not in whitespace:
                _LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
            # Split up the chars depending on the orientation
            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
                origin_lines_x[round(char.origin.x, 1)].append(char)
            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
                origin_lines_y[round(char.origin.y, 1)].append(char)
            else:
                # Only reached when the rotation fails every comparison (e.g. NaN).
                # The arguments need placeholders or logging fails to format the record.
                _LOGGER.error("Unknown char rotation: %s %s", char, char.rotation)

        # Convert characters into lines
        bbox_lines_y = []
        for chars in origin_lines_y.values():
            # Remove lines with whitespace only
            if all(c.unicode in whitespace for c in chars):
                continue
            origin = statistics.fmean(c.origin.y for c in chars)
            line = CharLine(
                self,
                chars,
                min(c.bbox.bottom for c in chars),
                origin,
                max(c.bbox.top for c in chars),
                max(c.height for c in chars),
                # Flip the y axis so that the sort origin increases down the page
                sort_origin=self.height - origin,
            )
            bbox_lines_y.append(line)
        bbox_lines = sorted(bbox_lines_y, key=lambda line: line._sort_origin)

        bbox_lines_x = []
        for chars in origin_lines_x.values():
            # Remove lines with whitespace only
            if all(c.unicode in whitespace for c in chars):
                continue
            line = CharLine(
                self,
                chars,
                min(c.bbox.left for c in chars),
                statistics.fmean(c.origin.x for c in chars),
                max(c.bbox.right for c in chars),
                max(c.width for c in chars),
                # Line rotation is chosen from the mean character rotation
                270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90,
            )
            bbox_lines_x.append(line)
        bbox_lines += sorted(bbox_lines_x, key=lambda line: line._sort_origin)

        if not bbox_lines:
            return []

        # Merge lines that have overlapping bbox_lines
        # FIXME: This merges lines that "collide" vertically like in formulas
        merged_lines = []
        current_line = bbox_lines[0]
        for next_line in bbox_lines[1:]:
            height = max(current_line.height, next_line.height)
            # Calculate overlap via normalize origin (increasing with line index)
            if (current_line._sort_origin + rtol * height) > (next_line._sort_origin - rtol * height):
                # The taller line provides the geometry of the merged line.
                # NOTE: an alternative for rotated lines or pages would be to
                # keep the line with more characters instead.
                use_current = current_line.height >= next_line.height
                line = current_line if use_current else next_line
                current_line = CharLine(
                    self,
                    current_line.chars + next_line.chars,
                    line.bottom,
                    line.origin,
                    line.top,
                    height,
                    line.rotation,
                    sort_origin=line._sort_origin,
                )
            else:
                # The next line does not overlap the current line
                merged_lines.append(current_line)
                current_line = next_line
        # append last line
        merged_lines.append(current_line)

        # Sort all lines horizontally based on character origin
        # The 1e9 offsets move line feeds and carriage returns to the end of the line
        sorted_lines = []
        for line in merged_lines:
            if line.rotation == 90:

                def sort_key(char):
                    if char.unicode in {0xA, 0xD}:
                        return char.tbbox.midpoint.y - 1e9
                    return char.tbbox.midpoint.y
            elif line.rotation == 270:

                def sort_key(char):
                    if char.unicode in {0xA, 0xD}:
                        return -char.tbbox.midpoint.y + 1e9
                    return -char.tbbox.midpoint.y
            else:

                def sort_key(char):
                    if char.unicode in {0xA, 0xD}:
                        return char.origin.x + 1e9
                    return char.origin.x

            sorted_lines.append(
                CharLine(
                    self,
                    sorted(line.chars, key=sort_key),
                    line.bottom,
                    line.origin,
                    line.top,
                    line.height,
                    line.rotation,
                    area.left,
                    sort_origin=line._sort_origin,
                )
            )

        return sorted_lines

    def graphic_bboxes_in_area(
        self, area: Rectangle, with_graphics: bool = True
    ) -> list[tuple[Rectangle, Table | Figure | None]]:
        """
        Split the area into graphics and the full width areas between them.

        1. Group vertically overlapping graphics into regions.
        2. Every gap above a region that is taller than the `y_em` spacing and
           the remainder below the last region span the full width of the area
           and are paired with `None`.
        3. Every graphic is paired with its own bounding box, joined with its
           `cbbox` if it has one.

        :param area: area to search for content.
        :param with_graphics: search for graphics in the area.
        :return: list of tuples (bounding box, graphic objects or `None`).
        """
        if with_graphics:
            graphics = self.graphics_in_area(area)
            regions = []
            # Check if graphics bounding boxes overlap vertically and group them
            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
                for reg in regions:
                    if reg.overlaps(gbbox.bottom, gbbox.top):
                        # They overlap, so merge them
                        reg.v0 = min(reg.v0, gbbox.bottom)
                        reg.v1 = max(reg.v1, gbbox.top)
                        reg.objs.append(graphic)
                        break
                else:
                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))

            # Walk down the area and emit the gaps and the graphics in order
            areas = []
            ypos = area.top
            for reg in regions:
                if ypos - reg.v1 > self._spacing["y_em"]:
                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
                for obj in reg.objs:
                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
                    areas.append((oarea, obj))
                ypos = reg.v0
            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
        else:
            areas = [(area, None)]
        return areas

    def objects_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[CharLine | Table | Figure]:
        """
        Find all content objects in this area.

        :param area: area to search for content.
        :param with_graphics: search for graphics in the area.
        :return: list of content objects sorted top to bottom.
        """
        self._link_characters()
        areas = self.graphic_bboxes_in_area(area, with_graphics)
        objects = []
        for narea, obj in areas:
            if obj is None:
                objects += self.charlines_in_area(narea)
            else:
                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox

                # Only keep the characters around the graphic, not the ones inside of it
                def predicate(c):
                    return not obj.bbox.contains(c.origin)

                lines = self.charlines_in_area(oarea, predicate)
                objects += sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x))
        return objects

    def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
        """
        Find all tables and figures in this area.

        This implementation finds none and always returns an empty list.

        :param area: area to search for graphics.
        :return: list of tables and figures.
        """
        return []

    def ast_in_area(self, area: Rectangle, with_graphics: bool = True) -> Node:
        """
        Convert the area content into an abstract syntax tree.

        This implementation only returns the `area` root node without children.

        :param area: area to search for content.
        :param with_graphics: including graphics in the area.
        :return: An abstract syntax tree including the content formatting.
        """
        return Node("area", obj=area, xpos=int(area.left), page=self)

    @property
    def content_ast(self) -> list[Node]:
        """The abstract syntax trees in the content area."""
        ast = []
        with_graphics = True
        for area in self._areas["content"]:
            ast.append(self.ast_in_area(area, with_graphics=with_graphics))
        # Add a page node to the first leaf to keep track of where a page starts
        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
        return ast

    @property
    def content_objects(self) -> list[CharLine | Table | Figure]:
        """All objects in the content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self.objects_in_area(area))
        return objs

    @property
    def content_graphics(self) -> list[Table | Figure]:
        """All graphics in the content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self.graphics_in_area(area))
        return objs

    @property
    def content_lines(self) -> list[CharLine]:
        """All lines in the content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self.charlines_in_area(area))
        return objs

    @property
    def content_tables(self) -> list[Table]:
        """All tables in the content areas."""
        return [o for o in self.content_graphics if isinstance(o, Table)]

    @property
    def content_figures(self) -> list[Figure]:
        """All figures in the content areas."""
        return [o for o in self.content_graphics if isinstance(o, Figure)]

    def __repr__(self) -> str:
        return f"Page({self.number})"
class Page(PdfPage):
    """
    A PDF page whose characters and graphics are grouped for HTML conversion.

    The spacing, font size, color and area definitions as well as
    `graphics_in_area` and `ast_in_area` only provide defaults here.
    NOTE(review): presumably document-specific subclasses override them; confirm.
    """

    def __init__(self, document, index: int):
        """
        :param document: a PDF document.
        :param index: 0-indexed page number.
        """
        super().__init__(document, index)
        self._template = "default"
        self.is_relevant: bool = True
        """Is this page relevant for the conversion?"""

    def _unicode_filter(self, code: int) -> int | None:
        """
        Map the unicode code point of a character before it is added to a line.

        `charlines_in_area` stores the returned value back into the character
        and drops the character if `None` is returned. The default keeps the
        code point unchanged.

        :param code: the unicode code point of the character.
        :return: the code point to use or `None` to discard the character.
        """
        return code

    @cached_property
    def _spacing(self) -> dict[str, float]:
        """
        Spacing thresholds used to group the page content.

        The `x_*` and `y_*` values are distances derived from the page width
        and height, the remaining values are unitless factors.
        """
        content = 0.1
        return {
            # Horizontal spacing: left->right
            "x_em": 0.01 * self.width,
            "x_left": content * self.width,
            "x_right": (1 - content) * self.width,
            "x_content": 0.2 * self.width,
            # Vertical spacing: bottom->top
            "y_em": 0.01 * self.height,
            # Max table line thickness
            "y_tline": 0.005 * self.height,
            # Max line height distance to detect paragraphs
            "lh": 0.9,
            # Max line height distance to detect super-/subscript
            "sc": 0.3,
            # Table header cell bold text threshold
            "th": 0.3,
        }

    def _line_size(self, line: CharLine) -> str:
        """
        Classify a line by its height.

        :param line: the character line to classify.
        :return: one of `h1`, `h2`, `h3`, `h4`, `n` or `fn`.
            NOTE(review): presumably heading levels, normal text and footnote; confirm.
        """
        rsize = line.height
        if rsize >= 17.5:
            return "h1"
        elif rsize >= 15.5:
            return "h2"
        elif rsize >= 13.5:
            return "h3"
        elif rsize >= 11.4:
            return "h4"
        elif rsize >= 8.5:
            return "n"
        else:
            return "fn"

    def _colors(self, color: int) -> str:
        """
        Name a color value.

        :param color: the color as an integer.
        :return: `black` for `0xFF`, `white` for `0xFFFFFFFF`, otherwise `unknown`.
        """
        # Each range currently contains exactly one value
        if 0xFF <= color <= 0xFF:
            return "black"
        if 0xFFFFFFFF <= color <= 0xFFFFFFFF:
            return "white"
        return "unknown"

    @cached_property
    def _areas(self) -> dict[str, list[Rectangle] | Rectangle]:
        """
        The named areas of the page scaled to the page size.

        Areas are declared relative to the page (0 to 1) and multiplied with
        the page width and height. Only a `content` area is defined here.
        """
        content = Rectangle(0.1, 0.1, 0.9, 0.9)
        areas = {"content": [content]}
        scaled_areas = {}

        def _s(r):
            return Rectangle(r.left * self.width, r.bottom * self.height, r.right * self.width, r.top * self.height)

        for name, area in areas.items():
            scaled_areas[name] = [_s(r) for r in area] if isinstance(area, list) else _s(area)
        return scaled_areas

    def _char_properties(self, line, char):
        """
        Collect the formatting properties of a character within its line.

        Bold and italic are derived from the font name, underline from the
        presence of a link, size from the line height. Super- and subscript are
        detected by comparing the character origin with the line origin: along
        x for lines with a rotation, along y otherwise.

        :param line: the line containing the character.
        :param char: the character to describe.
        :return: a dictionary of the character properties.
        """
        cp = {
            "superscript": False,
            "subscript": False,
            "bold": any(frag in char.font for frag in {"Bold"}),
            "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
            "underline": (char.objlink or char.weblink) is not None,
            "size": round(line.height),
            "relsize": self._line_size(line),
            "char": chr(char.unicode),
        }
        if line.rotation:
            if char.origin.x < (line.origin - 0.25 * line.height):
                cp["superscript"] = True
            elif char.origin.x > (line.origin + 0.15 * line.height):
                cp["subscript"] = True
        elif char.origin.y > (line.origin + 0.25 * line.height):
            cp["superscript"] = True
        elif char.origin.y < (line.origin - 0.15 * line.height):
            cp["subscript"] = True
        return cp

    def text_in_named_area(self, name: str, check_length: bool = True) -> str | None:
        """
        Find all text in the named area.

        :param name: the name of the area(s) to query.
        :param check_length: assert that the text has a length.
        :return: the concatenated text of the named area(s) or `None` if area not found.
        """
        if name not in self._areas:
            return None
        text = ""
        areas = self._areas[name]
        if not isinstance(areas, list):
            areas = [areas]
        for area in areas:
            text += self.text_in_area(area)
        if check_length:
            assert text
        return text

    def charlines_in_area(
        self, area: Rectangle, predicate: Callable[[Character], bool] | None = None, rtol: float | None = None
    ) -> list[CharLine]:
        """
        Coalesce the characters in the area and predicate into lines.

        1. Every character in the area is filtered by the `predicate`.
        2. Character orientation is split into horizontal (left->right) and
           vertical (bottom->top) character lines sorted by x or y position.
           Lines containing only whitespace are discarded.
        3. Overlapping character lines are merged into sub- and superscript
           using `rtol * max(current_line.height, next_line.height)` as the
           tolerance for checking if the lines overlap.
        4. The characters in the merged lines are re-sorted by origin.

        :param area: Area to search for characters.
        :param predicate: Function to discard characters in the area or include all by default.
        :param rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default.
        :return: A list of character lines sorted by x or y position.
        """
        if rtol is None:
            rtol = self._spacing["sc"]
        whitespace = {0xA, 0xD, 0x20}
        # Split all chars into lines based on rounded origin
        origin_lines_y = defaultdict(list)
        origin_lines_x = defaultdict(list)
        for char in self.chars_in_area(area):
            # Ignore all characters we don't want
            if predicate is not None and not predicate(char):
                continue
            cunicode = self._unicode_filter(char.unicode)
            if cunicode is None:
                continue
            # The filtered code point is written back into the character
            char.unicode = cunicode
            # Drop control characters except for the line feed
            if char.unicode < 32 and char.unicode not in {0xA}:
                continue
            # Characters without width that are not spaces are reported but kept
            if not char.width and char.unicode not in whitespace:
                _LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
            # Split up the chars depending on the orientation
            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
                origin_lines_x[round(char.origin.x, 1)].append(char)
            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
                origin_lines_y[round(char.origin.y, 1)].append(char)
            else:
                # Only reached when the rotation fails every comparison (e.g. NaN).
                # The arguments need placeholders or logging fails to format the record.
                _LOGGER.error("Unknown char rotation: %s %s", char, char.rotation)

        # Convert characters into lines
        bbox_lines_y = []
        for chars in origin_lines_y.values():
            # Remove lines with whitespace only
            if all(c.unicode in whitespace for c in chars):
                continue
            origin = statistics.fmean(c.origin.y for c in chars)
            line = CharLine(
                self,
                chars,
                min(c.bbox.bottom for c in chars),
                origin,
                max(c.bbox.top for c in chars),
                max(c.height for c in chars),
                # Flip the y axis so that the sort origin increases down the page
                sort_origin=self.height - origin,
            )
            bbox_lines_y.append(line)
        bbox_lines = sorted(bbox_lines_y, key=lambda line: line._sort_origin)

        bbox_lines_x = []
        for chars in origin_lines_x.values():
            # Remove lines with whitespace only
            if all(c.unicode in whitespace for c in chars):
                continue
            line = CharLine(
                self,
                chars,
                min(c.bbox.left for c in chars),
                statistics.fmean(c.origin.x for c in chars),
                max(c.bbox.right for c in chars),
                max(c.width for c in chars),
                # Line rotation is chosen from the mean character rotation
                270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90,
            )
            bbox_lines_x.append(line)
        bbox_lines += sorted(bbox_lines_x, key=lambda line: line._sort_origin)

        if not bbox_lines:
            return []

        # Merge lines that have overlapping bbox_lines
        # FIXME: This merges lines that "collide" vertically like in formulas
        merged_lines = []
        current_line = bbox_lines[0]
        for next_line in bbox_lines[1:]:
            height = max(current_line.height, next_line.height)
            # Calculate overlap via normalize origin (increasing with line index)
            if (current_line._sort_origin + rtol * height) > (next_line._sort_origin - rtol * height):
                # The taller line provides the geometry of the merged line.
                # NOTE: an alternative for rotated lines or pages would be to
                # keep the line with more characters instead.
                use_current = current_line.height >= next_line.height
                line = current_line if use_current else next_line
                current_line = CharLine(
                    self,
                    current_line.chars + next_line.chars,
                    line.bottom,
                    line.origin,
                    line.top,
                    height,
                    line.rotation,
                    sort_origin=line._sort_origin,
                )
            else:
                # The next line does not overlap the current line
                merged_lines.append(current_line)
                current_line = next_line
        # append last line
        merged_lines.append(current_line)

        # Sort all lines horizontally based on character origin
        # The 1e9 offsets move line feeds and carriage returns to the end of the line
        sorted_lines = []
        for line in merged_lines:
            if line.rotation == 90:

                def sort_key(char):
                    if char.unicode in {0xA, 0xD}:
                        return char.tbbox.midpoint.y - 1e9
                    return char.tbbox.midpoint.y
            elif line.rotation == 270:

                def sort_key(char):
                    if char.unicode in {0xA, 0xD}:
                        return -char.tbbox.midpoint.y + 1e9
                    return -char.tbbox.midpoint.y
            else:

                def sort_key(char):
                    if char.unicode in {0xA, 0xD}:
                        return char.origin.x + 1e9
                    return char.origin.x

            sorted_lines.append(
                CharLine(
                    self,
                    sorted(line.chars, key=sort_key),
                    line.bottom,
                    line.origin,
                    line.top,
                    line.height,
                    line.rotation,
                    area.left,
                    sort_origin=line._sort_origin,
                )
            )

        return sorted_lines

    def graphic_bboxes_in_area(
        self, area: Rectangle, with_graphics: bool = True
    ) -> list[tuple[Rectangle, Table | Figure | None]]:
        """
        Split the area into graphics and the full width areas between them.

        1. Group vertically overlapping graphics into regions.
        2. Every gap above a region that is taller than the `y_em` spacing and
           the remainder below the last region span the full width of the area
           and are paired with `None`.
        3. Every graphic is paired with its own bounding box, joined with its
           `cbbox` if it has one.

        :param area: area to search for content.
        :param with_graphics: search for graphics in the area.
        :return: list of tuples (bounding box, graphic objects or `None`).
        """
        if with_graphics:
            graphics = self.graphics_in_area(area)
            regions = []
            # Check if graphics bounding boxes overlap vertically and group them
            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
                for reg in regions:
                    if reg.overlaps(gbbox.bottom, gbbox.top):
                        # They overlap, so merge them
                        reg.v0 = min(reg.v0, gbbox.bottom)
                        reg.v1 = max(reg.v1, gbbox.top)
                        reg.objs.append(graphic)
                        break
                else:
                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))

            # Walk down the area and emit the gaps and the graphics in order
            areas = []
            ypos = area.top
            for reg in regions:
                if ypos - reg.v1 > self._spacing["y_em"]:
                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
                for obj in reg.objs:
                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
                    areas.append((oarea, obj))
                ypos = reg.v0
            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
        else:
            areas = [(area, None)]
        return areas

    def objects_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[CharLine | Table | Figure]:
        """
        Find all content objects in this area.

        :param area: area to search for content.
        :param with_graphics: search for graphics in the area.
        :return: list of content objects sorted top to bottom.
        """
        self._link_characters()
        areas = self.graphic_bboxes_in_area(area, with_graphics)
        objects = []
        for narea, obj in areas:
            if obj is None:
                objects += self.charlines_in_area(narea)
            else:
                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox

                # Only keep the characters around the graphic, not the ones inside of it
                def predicate(c):
                    return not obj.bbox.contains(c.origin)

                lines = self.charlines_in_area(oarea, predicate)
                objects += sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x))
        return objects

    def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
        """
        Find all tables and figures in this area.

        This implementation finds none and always returns an empty list.

        :param area: area to search for graphics.
        :return: list of tables and figures.
        """
        return []

    def ast_in_area(self, area: Rectangle, with_graphics: bool = True) -> Node:
        """
        Convert the area content into an abstract syntax tree.

        This implementation only returns the `area` root node without children.

        :param area: area to search for content.
        :param with_graphics: including graphics in the area.
        :return: An abstract syntax tree including the content formatting.
        """
        return Node("area", obj=area, xpos=int(area.left), page=self)

    @property
    def content_ast(self) -> list[Node]:
        """The abstract syntax trees in the content area."""
        ast = []
        with_graphics = True
        for area in self._areas["content"]:
            ast.append(self.ast_in_area(area, with_graphics=with_graphics))
        # Add a page node to the first leaf to keep track of where a page starts
        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
        return ast

    @property
    def content_objects(self) -> list[CharLine | Table | Figure]:
        """All objects in the content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self.objects_in_area(area))
        return objs

    @property
    def content_graphics(self) -> list[Table | Figure]:
        """All graphics in the content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self.graphics_in_area(area))
        return objs

    @property
    def content_lines(self) -> list[CharLine]:
        """All lines in the content areas."""
        objs = []
        for area in self._areas["content"]:
            objs.extend(self.charlines_in_area(area))
        return objs

    @property
    def content_tables(self) -> list[Table]:
        """All tables in the content areas."""
        return [o for o in self.content_graphics if isinstance(o, Table)]

    @property
    def content_figures(self) -> list[Figure]:
        """All figures in the content areas."""
        return [o for o in self.content_graphics if isinstance(o, Figure)]

    def __repr__(self) -> str:
        return f"Page({self.number})"
This class provides low-level access to the graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.
def __init__(self, document, index: int):
    """
    Initialize the base page and the conversion defaults.

    :param document: a PDF document.
    :param index: 0-indexed page number.
    """
    super().__init__(document, index)
    self.is_relevant: bool = True
    """Is this page relevant for the conversion?"""
    # Name of the page template, "default" unless changed later
    self._template = "default"
Parameters
- document: a PDF document.
- index: 0-indexed page number.
109 def text_in_named_area(self, name: str, check_length: bool = True) -> str | None: 110 """ 111 Find all text in the named area. 112 113 :param name: the name of the area(s) to query. 114 :param check_length: assert that the text has a length. 115 :return: the concatenated text of the named area(s) or `None` if area not found. 116 """ 117 if name not in self._areas: 118 return None 119 text = "" 120 areas = self._areas[name] 121 if not isinstance(areas, list): 122 areas = [areas] 123 for area in areas: 124 text += self.text_in_area(area) 125 if check_length: 126 assert text 127 return text
Find all text in the named area.
Parameters
- name: the name of the area(s) to query.
- check_length: assert that the text has a length.
Returns
the concatenated text of the named area(s) or `None` if area not found.
def charlines_in_area(
    self, area: Rectangle, predicate: Callable[[Character], bool] | None = None, rtol: float | None = None
) -> list[CharLine]:
    """
    Coalesce the characters in the area and predicate into lines.

    1. Every character in the area is filtered by the `predicate`.
    2. Character orientation is split into horizontal (left->right) and
       vertical (bottom->top) character lines sorted by x or y position.
       Lines containing only whitespace are discarded.
    3. Overlapping character lines are merged into sub- and superscript
       using `rtol * max(current_line.height, next_line.height)` as the
       tolerance for checking if the lines overlap.
    4. The characters in the merged lines are re-sorted by origin.

    :param area: Area to search for characters.
    :param predicate: Function to discard characters in the area or include all by default.
    :param rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default.
    :return: A list of character lines sorted by x or y position.
    """
    if rtol is None:
        rtol = self._spacing["sc"]
    whitespace = {0xA, 0xD, 0x20}
    # Split all chars into lines based on rounded origin
    origin_lines_y = defaultdict(list)
    origin_lines_x = defaultdict(list)
    for char in self.chars_in_area(area):
        # Ignore all characters we don't want
        if predicate is not None and not predicate(char):
            continue
        cunicode = self._unicode_filter(char.unicode)
        if cunicode is None:
            continue
        # The filtered code point is written back into the character
        char.unicode = cunicode
        # Drop control characters except for the line feed
        if char.unicode < 32 and char.unicode not in {0xA}:
            continue
        # Characters without width that are not spaces are reported but kept
        if not char.width and char.unicode not in whitespace:
            _LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
        # Split up the chars depending on the orientation
        if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
            origin_lines_x[round(char.origin.x, 1)].append(char)
        elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
            origin_lines_y[round(char.origin.y, 1)].append(char)
        else:
            # Only reached when the rotation fails every comparison (e.g. NaN).
            # The arguments need placeholders or logging fails to format the record.
            _LOGGER.error("Unknown char rotation: %s %s", char, char.rotation)

    # Convert characters into lines
    bbox_lines_y = []
    for chars in origin_lines_y.values():
        # Remove lines with whitespace only
        if all(c.unicode in whitespace for c in chars):
            continue
        origin = statistics.fmean(c.origin.y for c in chars)
        line = CharLine(
            self,
            chars,
            min(c.bbox.bottom for c in chars),
            origin,
            max(c.bbox.top for c in chars),
            max(c.height for c in chars),
            # Flip the y axis so that the sort origin increases down the page
            sort_origin=self.height - origin,
        )
        bbox_lines_y.append(line)
    bbox_lines = sorted(bbox_lines_y, key=lambda line: line._sort_origin)

    bbox_lines_x = []
    for chars in origin_lines_x.values():
        # Remove lines with whitespace only
        if all(c.unicode in whitespace for c in chars):
            continue
        line = CharLine(
            self,
            chars,
            min(c.bbox.left for c in chars),
            statistics.fmean(c.origin.x for c in chars),
            max(c.bbox.right for c in chars),
            max(c.width for c in chars),
            # Line rotation is chosen from the mean character rotation
            270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90,
        )
        bbox_lines_x.append(line)
    bbox_lines += sorted(bbox_lines_x, key=lambda line: line._sort_origin)

    if not bbox_lines:
        return []

    # Merge lines that have overlapping bbox_lines
    # FIXME: This merges lines that "collide" vertically like in formulas
    merged_lines = []
    current_line = bbox_lines[0]
    for next_line in bbox_lines[1:]:
        height = max(current_line.height, next_line.height)
        # Calculate overlap via normalize origin (increasing with line index)
        if (current_line._sort_origin + rtol * height) > (next_line._sort_origin - rtol * height):
            # The taller line provides the geometry of the merged line.
            # NOTE: an alternative for rotated lines or pages would be to
            # keep the line with more characters instead.
            use_current = current_line.height >= next_line.height
            line = current_line if use_current else next_line
            current_line = CharLine(
                self,
                current_line.chars + next_line.chars,
                line.bottom,
                line.origin,
                line.top,
                height,
                line.rotation,
                sort_origin=line._sort_origin,
            )
        else:
            # The next line does not overlap the current line
            merged_lines.append(current_line)
            current_line = next_line
    # append last line
    merged_lines.append(current_line)

    # Sort all lines horizontally based on character origin
    # The 1e9 offsets move line feeds and carriage returns to the end of the line
    sorted_lines = []
    for line in merged_lines:
        if line.rotation == 90:

            def sort_key(char):
                if char.unicode in {0xA, 0xD}:
                    return char.tbbox.midpoint.y - 1e9
                return char.tbbox.midpoint.y
        elif line.rotation == 270:

            def sort_key(char):
                if char.unicode in {0xA, 0xD}:
                    return -char.tbbox.midpoint.y + 1e9
                return -char.tbbox.midpoint.y
        else:

            def sort_key(char):
                if char.unicode in {0xA, 0xD}:
                    return char.origin.x + 1e9
                return char.origin.x

        sorted_lines.append(
            CharLine(
                self,
                sorted(line.chars, key=sort_key),
                line.bottom,
                line.origin,
                line.top,
                line.height,
                line.rotation,
                area.left,
                sort_origin=line._sort_origin,
            )
        )

    return sorted_lines
Coalesce the characters in the area and predicate into lines.
- Every character in the area is filtered by the `predicate`.
- Character orientation is split into horizontal (left->right) and vertical (bottom->top) character lines sorted by x or y position. Lines containing only whitespace are discarded.
- Overlapping character lines are merged into sub- and superscript using `rtol * max(current_line.height, next_line.height)` as the tolerance for checking if the lines overlap.
- The characters in the merged lines are re-sorted by origin.
Parameters
- area: Area to search for characters.
- predicate: Function to discard characters in the area or include all by default.
- rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default.
Returns
A list of character lines sorted by x or y position.
def graphic_bboxes_in_area(
    self, area: Rectangle, with_graphics: bool = True
) -> list[tuple[Rectangle, Table | Figure | None]]:
    """
    Split the area into vertically stacked boxes around its graphics.

    1. Graphics whose vertical extents overlap are collected into regions.
    2. The strips above, between and below the regions become full width
       boxes paired with `None`; every graphic contributes its own box.

    :param area: area to search for content.
    :param with_graphics: search for graphics in the area; when false the
                          whole area is returned as a single box.
    :return: list of tuples (bounding box, graphic object or `None`).
    """
    # Without graphics the whole area is one text-only box
    if not with_graphics:
        return [(area, None)]

    def extent_of(graphic):
        # The graphic's bbox, joined with its `cbbox` when that is truthy
        return graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox

    # Group graphics whose vertical extents overlap
    groups = []
    ordered = sorted(self.graphics_in_area(area), key=lambda g: (-g.bbox.top, g.bbox.x))
    for graphic in ordered:
        extent = extent_of(graphic)
        hit = next((grp for grp in groups if grp.overlaps(extent.bottom, extent.top)), None)
        if hit is None:
            # No existing group overlaps this graphic, so it starts a new one
            groups.append(Region(extent.bottom, extent.top, graphic))
        else:
            # Grow the first overlapping group to cover this graphic as well
            hit.v0 = min(hit.v0, extent.bottom)
            hit.v1 = max(hit.v1, extent.top)
            hit.objs.append(graphic)

    # Walk the groups in order and emit the strips between them
    boxes = []
    upper = area.top
    for grp in groups:
        # Only emit the strip above the group if it is taller than `y_em`
        if upper - grp.v1 > self._spacing["y_em"]:
            boxes.append((Rectangle(area.left, grp.v1, area.right, upper), None))
        boxes.extend((extent_of(member), member) for member in grp.objs)
        upper = grp.v0
    # The remainder below the last group is always emitted
    boxes.append((Rectangle(area.left, area.bottom, area.right, upper), None))
    return boxes
Coalesce the graphics in the area into full width bounding boxes.
- Group vertically overlapping graphics.
- Widen the overlapped graphics bounding boxes to the edges of the area.
Parameters
- area: area to search for content.
- with_graphics: search for graphics in the area.
Returns
list of tuples (bounding box, graphic objects or `None`).
def objects_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[CharLine | Table | Figure]:
    """
    Collect the character lines and graphics found in this area.

    The area is first split by `graphic_bboxes_in_area`. Boxes without a
    graphic contribute their character lines. Boxes with a graphic
    contribute the graphic plus the lines whose characters originate
    outside the graphic's `bbox`, ordered by `(-bbox.y, bbox.x)`.

    :param area: area to search for content.
    :param with_graphics: search for graphics in the area.
    :return: list of content objects sorted top to bottom.
    """
    self._link_characters()
    found = []
    for box, graphic in self.graphic_bboxes_in_area(area, with_graphics):
        if graphic is None:
            # Text-only box: every line in it is content
            found.extend(self.charlines_in_area(box))
            continue

        # Search the graphic's bbox, joined with its `cbbox` when that is truthy
        extent = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
        # Characters inside the graphic itself belong to the graphic, not to a line
        lines = self.charlines_in_area(extent, lambda c: not graphic.bbox.contains(c.origin))
        mixed = lines + [graphic]
        mixed.sort(key=lambda o: (-o.bbox.y, o.bbox.x))
        found.extend(mixed)
    return found
Find all content objects in this area.
Parameters
- area: area to search for content.
- with_graphics: search for graphics in the area.
Returns
list of content objects sorted top to bottom.
def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
    """
    Locate the tables and figures inside this area.

    This implementation does not inspect the area and always returns an
    empty list.

    :param area: area to search for graphics.
    :return: list of tables and figures.
    """
    no_graphics: list[Table | Figure] = []
    return no_graphics
Find all tables and figures in this area.
Parameters
- area: area to search for graphics.
Returns
list of tables and figures.
def ast_in_area(self, area: Rectangle, with_graphics: bool = True) -> Node:
    """
    Build the abstract syntax tree for the content of an area.

    This implementation only creates the root "area" node, which carries
    the area, its integer left edge and this page; `with_graphics` is not
    used here.

    :param area: area to search for content.
    :param with_graphics: including graphics in the area.
    :return: An abstract syntax tree including the content formatting.
    """
    attributes = {"obj": area, "xpos": int(area.left), "page": self}
    return Node("area", **attributes)
Convert the area content into an abstract syntax tree.
Parameters
- area: area to search for content.
- with_graphics: including graphics in the area.
Returns
An abstract syntax tree including the content formatting.
@property
def content_ast(self) -> list[Node]:
    """
    The abstract syntax trees in the content area.

    One tree is built per content area via `ast_in_area` with graphics
    enabled. A "page" node carrying this page's number is attached to the
    first leaf of the first tree to keep track of where a page starts.

    :return: list of trees, one per content area; empty when the page has
             no content areas.
    """
    ast = [self.ast_in_area(area, with_graphics=True) for area in self._areas["content"]]
    if not ast:
        # No content areas: there is no tree to attach the page marker to
        return ast
    root = ast[0]
    # Fall back to the root itself when it has no descendants
    first_leaf = next((n for n in root.descendants if n.is_leaf), root)
    Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
    return ast
The abstract syntax trees in the content area.
@property
def content_objects(self) -> list[CharLine | Table | Figure]:
    """The objects of every content area, concatenated in area order."""
    return [found for area in self._areas["content"] for found in self.objects_in_area(area)]
All objects in the content areas.
@property
def content_graphics(self) -> list[Table | Figure]:
    """The graphics of every content area, concatenated in area order."""
    return [graphic for area in self._areas["content"] for graphic in self.graphics_in_area(area)]
All graphics in the content areas.
@property
def content_lines(self) -> list[CharLine]:
    """The character lines of every content area, concatenated in area order."""
    return [line for area in self._areas["content"] for line in self.charlines_in_area(area)]
All lines in the content areas.
@property
def content_tables(self) -> list[Table]:
    """The content graphics that are `Table` instances."""
    graphics = self.content_graphics
    return [graphic for graphic in graphics if isinstance(graphic, Table)]
All tables in the content areas.
@property
def content_figures(self) -> list[Figure]:
    """The content graphics that are `Figure` instances."""
    graphics = self.content_graphics
    return [graphic for graphic in graphics if isinstance(graphic, Figure)]
All figures in the content areas.
Inherited Members
- modm_data.pdf.page.Page
- index
- number
- label
- width
- height
- rotation
- bbox
- char_count
- char
- chars
- objlinks
- weblinks
- chars_in_area
- text_in_area
- structures
- find
- paths
- images
- graphic_clusters
- pypdfium2._helpers.page.PdfPage
- parent
- get_width
- get_height
- get_size
- get_rotation
- set_rotation
- get_mediabox
- set_mediabox
- get_cropbox
- set_cropbox
- get_bleedbox
- set_bleedbox
- get_trimbox
- set_trimbox
- get_artbox
- set_artbox
- get_bbox
- get_textpage
- insert_obj
- remove_obj
- gen_content
- get_objects
- render
- pypdfium2.internal.bases.AutoCloseable
- close