modm_data.pdf2html.stmicro.table
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

import logging
import statistics
from functools import cached_property
from collections import defaultdict
from ...utils import HLine, VLine, Rectangle

LOGGER = logging.getLogger(__name__)


class TableCell:
    """One logical table cell, possibly spanning several grid positions.

    ``positions`` is a sorted list of ``(y, x)`` grid coordinates covered by
    the cell; ``b`` holds the border widths of the cell.
    """

    class Borders:
        """Widths of the four cell borders; a falsy width means the side is open."""

        def __init__(self, l, b, r, t):
            self.l = l  # left
            self.b = b  # bottom
            self.r = r  # right
            self.t = t  # top

    def __init__(self, table, position, bbox, borders, is_simple=False):
        self._table = table
        self._bboxes = [bbox]
        self.b = borders
        self.positions = [position]
        self.is_header = False
        self._is_simple = is_simple
        # Lazily computed caches, reset whenever the cell is merged
        self._bbox = None
        self._lines = None

    def _merge(self, other):
        """Absorb the positions and bounding box of ``other`` into this cell."""
        self.positions.extend(other.positions)
        self.positions.sort()
        self._bboxes.append(other.bbox)
        self._bbox = None
        self._lines = None

    def _move(self, x, y):
        """Shift every position by ``x`` columns and ``y`` rows."""
        self.positions = [(py + y, px + x) for (py, px) in self.positions]
        self.positions.sort()

    def _expand(self, dx, dy):
        """Grow the cell by ``dx`` columns and ``dy`` rows from its last position.

        Positions the cell already covers (including the anchor itself) are
        not added again, so ``positions`` stays free of duplicates.
        """
        ymax, xmax = self.positions[-1]
        occupied = set(self.positions)
        for yi in range(ymax, ymax + dy + 1):
            for xi in range(xmax, xmax + dx + 1):
                if (yi, xi) not in occupied:
                    occupied.add((yi, xi))
                    self.positions.append((yi, xi))
        self.positions.sort()

    @property
    def x(self) -> int:
        """Column of the first (smallest) position."""
        return self.positions[0][1]

    @property
    def y(self) -> int:
        """Row of the first (smallest) position."""
        return self.positions[0][0]

    @property
    def xspan(self) -> int:
        """Number of columns between the first and last position, inclusive."""
        return self.positions[-1][1] - self.positions[0][1] + 1

    @property
    def yspan(self) -> int:
        """Number of rows between the first and last position, inclusive."""
        return self.positions[-1][0] - self.positions[0][0] + 1

    @property
    def rotation(self) -> int:
        """Rotation of the first character line, or 0 if the cell has no lines."""
        if not self.lines:
            return 0
        return self.lines[0].rotation

    @property
    def bbox(self) -> Rectangle:
        """Bounding box enclosing all merged cell boxes (cached)."""
        if self._bbox is None:
            self._bbox = Rectangle(min(bbox.left for bbox in self._bboxes),
                                   min(bbox.bottom for bbox in self._bboxes),
                                   max(bbox.right for bbox in self._bboxes),
                                   max(bbox.top for bbox in self._bboxes))
        return self._bbox

    @property
    def lines(self):
        """Character lines of the page filtered to this cell's bbox (cached)."""
        if self._lines is None:
            self._lines = self._table._page._charlines_filtered(self.bbox)
        return self._lines

    @property
    def content(self):
        """All characters of all lines joined into one string."""
        return "".join(c.char for line in self.lines for c in line.chars)

    @property
    def left_aligned(self):
        """True if any line's left gap plus one ``x_em`` is smaller than its right gap."""
        x_em = self._table._page._spacing["x_em"]
        for line in self.lines:
            if (line.bbox.left - self.bbox.left + x_em) < (self.bbox.right - line.bbox.right):
                return True
        return False

    @property
    def ast(self):
        """AST of the cell area as produced by the page, with its name set to ``"cell"``."""
        ast = self._table._page._ast_filtered(self.bbox, with_graphics=False,
                                              ignore_xpos=not self.left_aligned,
                                              with_bits=False, with_notes=False)
        ast.name = "cell"
        return ast

    def __repr__(self) -> str:
        positions = ",".join(f"({p[1]},{p[0]})" for p in self.positions)
        borders = ""
        if self.b.l: borders += "["
        if self.b.b: borders += "_"
        if self.b.t: borders += "^"
        if self.b.r: borders += "]"
        start = "CellH" if self.is_header else "Cell"
        return start + f"[{positions}] {borders}"


class Table:
    """Grid of cells reconstructed from the horizontal and vertical lines of a page area."""

    def __init__(self, page, bbox: Rectangle, xlines: list, ylines: list,
                 cbbox: Rectangle = None, is_register: bool = False):
        """Cluster the lines into a grid; for registers also derive columns from bit numbers.

        NOTE(review): the nesting below was reconstructed from a listing
        without indentation; in particular the placement of ``break`` and of
        the "close the top" statement should be confirmed against the
        repository.
        """
        self._page = page
        self._spacing = page._spacing
        self.bbox = bbox
        self.cbbox = None if is_register else cbbox
        self._type = "table"
        self._bit_headers = None

        # Coalesce the vertical lines to detect the grid
        def _cluster(lines, key):
            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
            grid = defaultdict(list)
            last = -1e9
            current = -1e9
            for line in sorted(lines, key=key):
                # Start a new cluster when the gap to the previous line exceeds atol
                if (last + atol) < key(line):
                    current = key(line)
                grid[current].append(line)
                last = key(line)
            return grid
        xgrid = _cluster(xlines, lambda line: line.p0.x)
        ygrid = _cluster(ylines, lambda line: line.p0.y)

        if is_register:
            self._type = "register"

            # Find the positions of the top numbers
            clusters = []
            if lines := self._page._charlines_filtered(cbbox):
                if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
                    clusters.append((cluster, cbbox))
                else:
                    # __repr__ reads self.grid, so it must exist before logging
                    self.grid = (0, 0)
                    LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")

            # Find the positions of the second row of numbers
            if len(ygrid) > 2:
                for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y,
                                      self.bbox.right, ygrid[ypos1][0].p0.y)
                    if lines := self._page._charlines_filtered(nbbox):
                        if all(c.char.isnumeric() or c.unicode in {0x20, 0xa, 0xd} for c in lines[0].chars):
                            if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
                                clusters.append((cluster, nbbox))
                                self._bit_headers = len(ygrid) - yi - 1
                            else:
                                self.grid = (len(cluster), 0)
                                LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
                            break

            # Merge these clusters to find their positions
            for cluster, bbox in clusters:
                # Close left and right side
                xgrid[sorted(xgrid)[0]].append(VLine(self.bbox.left, bbox.bottom, bbox.top))
                xgrid[sorted(xgrid)[-1]].append(VLine(self.bbox.right, bbox.bottom, bbox.top))
                # Now close the lines in between
                for cleft, cright in zip(cluster, cluster[1:]):
                    # find a line between the clusters
                    xpos = next(((x, xgrid[x][0].p0.x) for x in sorted(xgrid)
                                 if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left), None)
                    # Didn't find one, we must add one manually
                    if xpos is None:
                        xpos = (cleft.bbox.right + cright.bbox.left) / 2
                        xpos = (int(round(xpos)), xpos)
                    # Add it to the grid
                    xgrid[xpos[0]].append(VLine(xpos[1], bbox.bottom, bbox.top))
            # close the top
            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))

        # Fix the position keys properly
        self._xgrid = {int(round(statistics.fmean(m.p0.x for m in group))): group
                       for group in xgrid.values()}
        self._ygrid = {int(round(statistics.fmean(m.p0.y for m in group))): group
                       for group in ygrid.values()}
        # Map the positions to integers
        self._xpos = list(sorted(self._xgrid))
        self._ypos = list(sorted(self._ygrid))

        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
        self._cells = None

    def _cell_borders(self, x: int, y: int, bbox: Rectangle,
                      mask: int = 0b1111) -> TableCell.Borders:
        """Return the border widths of grid cell ``(x, y)``.

        A side gets the width of the first grid line on that side that spans
        the midpoint of ``bbox``; sides not selected by ``mask`` (bit order
        left, bottom, right, top) or without such a line stay 0.
        """
        # left, bottom, right, top
        borders = [0, 0, 0, 0]
        mp = bbox.midpoint
        if mask & 0b1000:  # Left
            for line in self._xgrid[self._xpos[x]]:
                if line.p0.y < mp.y < line.p1.y:
                    borders[0] = line.width
                    assert line.width
                    break
        if mask & 0b0010:  # Right
            for line in self._xgrid[self._xpos[x + 1]]:
                if line.p0.y < mp.y < line.p1.y:
                    borders[2] = line.width
                    assert line.width
                    break
        if mask & 0b0100:  # Bottom
            for line in self._ygrid[self._ypos[y]]:
                if line.p0.x < mp.x < line.p1.x:
                    borders[1] = line.width
                    assert line.width
                    break
        if mask & 0b0001:  # Top
            for line in self._ygrid[self._ypos[y + 1]]:
                if line.p0.x < mp.x < line.p1.x:
                    borders[3] = line.width
                    assert line.width
                    break

        return TableCell.Borders(*borders)

    def _fix_borders(self, cells, x: int, y: int):
        """Close inconsistent open borders between cell ``(x, y)`` and its right/top neighbors."""
        c = cells[(x, y)].b
        right = cells[(x + 1, y)]
        top = cells[(x, y + 1)]
        # A missing neighbor is modelled by a throw-away Borders object
        r = right.b if right is not None else TableCell.Borders(0, 0, 1, 0)
        t = top.b if top is not None else TableCell.Borders(0, 1, 0, 0)

        # Open at the top into a span
        if (not c.t and c.r) and (not t.r or not t.l):
            c.t = 1
            t.b = 1
        # Open at the top and self is a span
        if (not c.t and not c.r) and (t.r and t.l):
            c.t = 1
            t.b = 1

        # Open to the right into a span
        if (not c.r and c.t) and (not r.t or not r.b):
            c.r = 1
            r.l = 1
        # Open to the right and self is a span
        if (not c.r and not c.t) and (r.t and r.b):
            c.r = 1
            r.l = 1

    @property
    def cells(self) -> list[TableCell]:
        """Cells of the table sorted by their first position (computed once, then cached).

        NOTE(review): the nesting of the header detection was reconstructed
        from a listing without indentation; confirm against the repository.
        """
        if self._cells is None:
            # Either dimension being empty means there is nothing to build.
            # (A tuple comparison against (1, 1) would let e.g. (2, 0) through.)
            if min(self.grid) < 1:
                self._cells = []
                return self._cells

            # First determine the spans of cells by checking the borders
            cells = defaultdict(lambda: None)
            for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
                for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
                    bbox = Rectangle(x0, y0, x1, y1)
                    borders = self._cell_borders(xi, yi, bbox, 0b1111)
                    # Grid rows count from the bottom, cell positions from the top
                    cells[(xi, yi)] = TableCell(self, (self.grid[1] - 1 - yi, xi),
                                                bbox, borders, self._type == "register")

            # Fix table cell borders via consistency checks
            for yi in range(self.grid[1]):
                for xi in range(self.grid[0]):
                    self._fix_borders(cells, xi, yi)

            # Merge the cells recursively
            def _merge(px, py, x, y):
                if cells[(x, y)] is None:
                    return
                # Right border is open
                if not cells[(x, y)].b.r:
                    if cells[(x + 1, y)] is not None:
                        cells[(px, py)]._merge(cells[(x + 1, y)])
                        _merge(px, py, x + 1, y)
                        cells[(x + 1, y)] = None
                # Top border is open
                if not cells[(x, y)].b.t:
                    if cells[(x, y + 1)] is not None:
                        cells[(px, py)]._merge(cells[(x, y + 1)])
                        _merge(px, py, x, y + 1)
                        cells[(x, y + 1)] = None
            # Start merging in bottom left cell
            for yi in range(self.grid[1]):
                for xi in range(self.grid[0]):
                    _merge(xi, yi, xi, yi)

            # Find the header line, it is thicker than normal
            y_header_pos = self.grid[1]
            if self._type != "register":
                if self.grid[1] > 1:
                    line_widths = {round(line.width, 1) for llist in self._ygrid.values()
                                   for line in llist if line.width != 0.1}  # magic width of virtual borders
                    if line_widths:
                        line_width_max = max(line_widths) * 0.9
                        if min(line_widths) < line_width_max:
                            # Find the first thick line starting from the top
                            y_header_pos = next((yi for yi, ypos in reversed(list(enumerate(self._ypos)))
                                                 if any(line.width > line_width_max for line in self._ygrid[ypos])),
                                                y_header_pos)

            # Map all the header
            is_bold = []
            for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
                bbox = None
                for xi in range(self.grid[0]):
                    if (cell := cells[(xi, yi)]) is not None:
                        if bbox is None:
                            bbox = cell.bbox
                        else:
                            bbox = bbox.joined(cell.bbox)
                if bbox is None:
                    continue
                chars = self._page.chars_in_area(bbox)
                is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
                is_bold.append((yi, is_bold_pct > self._spacing["th"]))

            # Some tables have no bold cells at all
            if all(not b[1] for b in is_bold):
                # Special case for two row tables without bold headers, but a bold line inbetween
                if self.grid[1] == 2 and y_header_pos == 1:
                    y_header_pos = 2
            else:
                if y_header_pos < self.grid[1]:
                    # Find the lowest bold row starting from bold line
                    y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
                else:
                    # Find the lowest bold row starting from the top
                    for b in reversed(is_bold):
                        if not b[1]:
                            break
                        y_header_pos = b[0]

            # Tell the header cells
            for yi in range(y_header_pos, self.grid[1]):
                for xi in range(self.grid[0]):
                    if (cell := cells[(xi, yi)]) is not None:
                        cell.is_header = True

            # Flatten into array
            cells = [c for c in cells.values() if c is not None]

            # Normalize cells for registers by moving the lower ones right and up
            if self._type == "register" and self._bit_headers is not None:
                for cell in cells:
                    if cell.y >= self._bit_headers:
                        cell._move(16, -self._bit_headers)
                    elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
                        cell._expand(0, 3 - self._bit_headers)
                self.grid = (32, 4)

            self._cells = list(sorted(cells, key=lambda c: c.positions[0]))

        return self._cells

    def append_bottom(self, other, merge_headers=True) -> bool:
        """Append the non-header cells of ``other`` below this table.

        When ``merge_headers`` is set and the column counts differ, the column
        positions of both tables are first rewritten onto a shared layout
        derived from the column spans of their header cells.

        Returns False, before any cell is moved, when the two tables have a
        different number of header column groups; True otherwise.
        """
        debug = False
        xgrid = self.grid[0]
        if merge_headers and xgrid != other.grid[0]:
            # Some tables have different column layouts due to span cells
            # So we must correct the X positions of all cells accordingly
            self_xheaders = defaultdict(set)
            other_xheaders = defaultdict(set)
            self_headers = [c for c in self.cells if c.is_header]
            other_headers = [c for c in other.cells if c.is_header]
            # Find the smallest set of spanning xpositions based on the header cells
            for xpos in range(self.grid[0]):
                for hcell in self_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        self_xheaders[hcell.x].add(xpos)
            for xpos in range(other.grid[0]):
                for hcell in other_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        other_xheaders[hcell.x].add(xpos)

            # Compute the shared
            self_heads = sorted(self_xheaders.keys())
            other_heads = sorted(other_xheaders.keys())
            xgrid = 0
            merged_xheaders = defaultdict(set)
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(self_heads, other_heads):
                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
                xgrid += size

            if debug:
                print(len(self_xheaders), self_xheaders)
                print(len(other_xheaders), other_xheaders)
                print(len(merged_xheaders), merged_xheaders)
            # If they are not equal length the table layouts are not compatible at all!
            if len(self_heads) != len(other_heads):
                LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
                return False

            # We want to stuff/move the cell positions inplace, therefore we start
            # backwards moving the high numbers even higher, so that we don't
            # overwrite ourselves and get stuck in an infinite loop
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
                merged_xhead = max(self_xhead, other_xhead)
                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
                merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)

                def _insert_cells(cell, src, dsts, insert_only):
                    assert dsts
                    new_positions = []
                    any_change = False
                    for cpos in reversed(cell.positions):
                        if insert_only:
                            # If our set is empty we must only insert positions
                            if cpos[1] == src:
                                for xpos in dsts:
                                    if debug:
                                        print(f"Insert {cpos}++{(cpos[0], xpos)}")
                                    new_positions.append((cpos[0], xpos))
                                any_change = True
                            new_positions.append(cpos)
                        else:
                            # We must move (=replace and add) the span positions
                            if cpos[1] == src:
                                if debug:
                                    print(f"Move {cpos}->{(cpos[0], dsts[0])}")
                                new_positions.append((cpos[0], dsts[0]))
                                any_change = True
                            else:
                                new_positions.append(cpos)
                    if debug and any_change:
                        print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
                        print("old=", cell.positions, "new=", sorted(new_positions))
                        print()
                    assert new_positions
                    assert len(new_positions) == len(set(new_positions))
                    cell.positions = sorted(new_positions)

                def _move_cells(cells, own_xpos):
                    if debug:
                        print()
                        print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
                        print()

                    for ii in range(max(len(own_xpos), len(merged_xpos))):
                        insert_only = ii >= len(own_xpos)
                        if insert_only:
                            src = merged_xpos[ii - 1]
                            dsts = merged_xpos[ii:]
                            if debug: print(f"{src}->{dsts} I")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, True)
                            break
                        else:
                            src = own_xpos[ii]
                            dsts = merged_xpos[ii:ii + 1]
                            if debug: print(f"{src}->{dsts} M")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, False)

                if debug: print()
                if self_xpos != merged_xpos:
                    if debug:
                        print(f"====== Self: x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
                    _move_cells(self.cells, self_xpos)
                if other_xpos != merged_xpos:
                    if debug:
                        print(f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}")
                    _move_cells(other.cells, other_xpos)
            if debug:
                print()
                print()
                print()

        # We must move the cells downwards now, but minus the header rows
        rows = self.grid[1] - other.header_rows
        for cell in other.cells:
            # Discard the header cells, we just assume they are the same
            if not cell.is_header:
                cell._move(0, rows)
                self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (xgrid, other.grid[1] + rows)
        if debug:
            print(f"{self._page} -> {self.grid}")
        return True

    def append_side(self, other, expand=False) -> bool:
        """Append all cells of ``other`` to the right of this table.

        If the row counts differ, the shorter table's bottom cells are
        expanded downwards when ``expand`` is set; otherwise an error is
        logged and False is returned without moving any cell.
        """
        if self.grid[1] != other.grid[1]:
            if expand:
                LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
                ymin = min(self.grid[1], other.grid[1])
                ymax = max(self.grid[1], other.grid[1])
                etable = other if self.grid[1] > other.grid[1] else self
                for cell in etable.cells:
                    if any(p[0] == ymin - 1 for p in cell.positions):
                        cell._expand(0, ymax - ymin)
                etable.grid = (etable.grid[0], ymax)
            else:
                LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
                return False

        # We must move all cells to the right now
        columns = self.grid[0]
        for cell in other.cells:
            cell._move(columns, 0)
            self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
        return True

    @cached_property
    def header_rows(self) -> int:
        """Number of rows occupied by header cells, 0 if there are none (cached)."""
        header_cells = [c for c in self.cells if c.is_header]
        if header_cells:
            return max(c.positions[-1][0] + 1 for c in header_cells)
        return 0

    def __repr__(self) -> str:
        return f"Table({self.grid[0]}x{self.grid[1]})"


class VirtualTable(Table):
    """Table assembled from already existing cells instead of detected lines."""

    def __init__(self, page, bbox, cells, table_type=None):
        # Deliberately does not call Table.__init__: there are no lines to cluster
        self._page = page
        self._spacing = page._spacing
        self._type = table_type or "virtual"
        self.bbox = bbox
        self._cells = cells
        # default=-1 yields a 0x0 grid for an empty cell list instead of raising
        self.grid = (max((c.x for c in cells), default=-1) + 1,
                     max((c.y for c in cells), default=-1) + 1)
        for cell in cells:
            cell._table = self

    def __repr__(self) -> str:
        return f"VTable({self.grid[0]}x{self.grid[1]})"
LOGGER =
<Logger modm_data.pdf2html.stmicro.table (WARNING)>
class
TableCell:
class TableCell:
    """One logical table cell, possibly spanning several grid positions.

    ``positions`` is a sorted list of ``(y, x)`` grid coordinates covered by
    the cell; ``b`` holds the border widths of the cell.
    """

    class Borders:
        """Widths of the four cell borders; a falsy width means the side is open."""

        def __init__(self, l, b, r, t):
            self.l = l  # left
            self.b = b  # bottom
            self.r = r  # right
            self.t = t  # top

    def __init__(self, table, position, bbox, borders, is_simple=False):
        self._table = table
        self._bboxes = [bbox]
        self.b = borders
        self.positions = [position]
        self.is_header = False
        self._is_simple = is_simple
        # Lazily computed caches, reset whenever the cell is merged
        self._bbox = None
        self._lines = None

    def _merge(self, other):
        """Absorb the positions and bounding box of ``other`` into this cell."""
        self.positions.extend(other.positions)
        self.positions.sort()
        self._bboxes.append(other.bbox)
        self._bbox = None
        self._lines = None

    def _move(self, x, y):
        """Shift every position by ``x`` columns and ``y`` rows."""
        self.positions = [(py + y, px + x) for (py, px) in self.positions]
        self.positions.sort()

    def _expand(self, dx, dy):
        """Grow the cell by ``dx`` columns and ``dy`` rows from its last position.

        Positions the cell already covers (including the anchor itself) are
        not added again, so ``positions`` stays free of duplicates.
        """
        ymax, xmax = self.positions[-1]
        occupied = set(self.positions)
        for yi in range(ymax, ymax + dy + 1):
            for xi in range(xmax, xmax + dx + 1):
                if (yi, xi) not in occupied:
                    occupied.add((yi, xi))
                    self.positions.append((yi, xi))
        self.positions.sort()

    @property
    def x(self) -> int:
        """Column of the first (smallest) position."""
        return self.positions[0][1]

    @property
    def y(self) -> int:
        """Row of the first (smallest) position."""
        return self.positions[0][0]

    @property
    def xspan(self) -> int:
        """Number of columns between the first and last position, inclusive."""
        return self.positions[-1][1] - self.positions[0][1] + 1

    @property
    def yspan(self) -> int:
        """Number of rows between the first and last position, inclusive."""
        return self.positions[-1][0] - self.positions[0][0] + 1

    @property
    def rotation(self) -> int:
        """Rotation of the first character line, or 0 if the cell has no lines."""
        if not self.lines:
            return 0
        return self.lines[0].rotation

    @property
    def bbox(self) -> "Rectangle":
        """Bounding box enclosing all merged cell boxes (cached)."""
        if self._bbox is None:
            self._bbox = Rectangle(min(bbox.left for bbox in self._bboxes),
                                   min(bbox.bottom for bbox in self._bboxes),
                                   max(bbox.right for bbox in self._bboxes),
                                   max(bbox.top for bbox in self._bboxes))
        return self._bbox

    @property
    def lines(self):
        """Character lines of the page filtered to this cell's bbox (cached)."""
        if self._lines is None:
            self._lines = self._table._page._charlines_filtered(self.bbox)
        return self._lines

    @property
    def content(self):
        """All characters of all lines joined into one string."""
        return "".join(c.char for line in self.lines for c in line.chars)

    @property
    def left_aligned(self):
        """True if any line's left gap plus one ``x_em`` is smaller than its right gap."""
        x_em = self._table._page._spacing["x_em"]
        for line in self.lines:
            if (line.bbox.left - self.bbox.left + x_em) < (self.bbox.right - line.bbox.right):
                return True
        return False

    @property
    def ast(self):
        """AST of the cell area as produced by the page, with its name set to ``"cell"``."""
        ast = self._table._page._ast_filtered(self.bbox, with_graphics=False,
                                              ignore_xpos=not self.left_aligned,
                                              with_bits=False, with_notes=False)
        ast.name = "cell"
        return ast

    def __repr__(self) -> str:
        positions = ",".join(f"({p[1]},{p[0]})" for p in self.positions)
        borders = ""
        if self.b.l: borders += "["
        if self.b.b: borders += "_"
        if self.b.t: borders += "^"
        if self.b.r: borders += "]"
        start = "CellH" if self.is_header else "Cell"
        return start + f"[{positions}] {borders}"
class
TableCell.Borders:
class Borders:
    """Widths of the left, bottom, right and top border of a cell."""

    def __init__(self, l, b, r, t):
        # Arguments arrive in the order left, bottom, right, top
        self.l, self.b, self.r, self.t = l, b, r, t
class
Table:
class Table:
    """Grid of cells reconstructed from the horizontal and vertical lines of a page area."""

    def __init__(self, page, bbox: "Rectangle", xlines: list, ylines: list,
                 cbbox: "Rectangle" = None, is_register: bool = False):
        """Cluster the lines into a grid; for registers also derive columns from bit numbers.

        NOTE(review): the nesting below was reconstructed from a listing
        without indentation; in particular the placement of ``break`` and of
        the "close the top" statement should be confirmed against the
        repository.
        """
        self._page = page
        self._spacing = page._spacing
        self.bbox = bbox
        self.cbbox = None if is_register else cbbox
        self._type = "table"
        self._bit_headers = None

        # Coalesce the vertical lines to detect the grid
        def _cluster(lines, key):
            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
            grid = defaultdict(list)
            last = -1e9
            current = -1e9
            for line in sorted(lines, key=key):
                # Start a new cluster when the gap to the previous line exceeds atol
                if (last + atol) < key(line):
                    current = key(line)
                grid[current].append(line)
                last = key(line)
            return grid
        xgrid = _cluster(xlines, lambda line: line.p0.x)
        ygrid = _cluster(ylines, lambda line: line.p0.y)

        if is_register:
            self._type = "register"

            # Find the positions of the top numbers
            clusters = []
            if lines := self._page._charlines_filtered(cbbox):
                if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
                    clusters.append((cluster, cbbox))
                else:
                    # __repr__ reads self.grid, so it must exist before logging
                    self.grid = (0, 0)
                    LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")

            # Find the positions of the second row of numbers
            if len(ygrid) > 2:
                for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y,
                                      self.bbox.right, ygrid[ypos1][0].p0.y)
                    if lines := self._page._charlines_filtered(nbbox):
                        if all(c.char.isnumeric() or c.unicode in {0x20, 0xa, 0xd} for c in lines[0].chars):
                            if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
                                clusters.append((cluster, nbbox))
                                self._bit_headers = len(ygrid) - yi - 1
                            else:
                                self.grid = (len(cluster), 0)
                                LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
                            break

            # Merge these clusters to find their positions
            for cluster, bbox in clusters:
                # Close left and right side
                xgrid[sorted(xgrid)[0]].append(VLine(self.bbox.left, bbox.bottom, bbox.top))
                xgrid[sorted(xgrid)[-1]].append(VLine(self.bbox.right, bbox.bottom, bbox.top))
                # Now close the lines in between
                for cleft, cright in zip(cluster, cluster[1:]):
                    # find a line between the clusters
                    xpos = next(((x, xgrid[x][0].p0.x) for x in sorted(xgrid)
                                 if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left), None)
                    # Didn't find one, we must add one manually
                    if xpos is None:
                        xpos = (cleft.bbox.right + cright.bbox.left) / 2
                        xpos = (int(round(xpos)), xpos)
                    # Add it to the grid
                    xgrid[xpos[0]].append(VLine(xpos[1], bbox.bottom, bbox.top))
            # close the top
            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))

        # Fix the position keys properly
        self._xgrid = {int(round(statistics.fmean(m.p0.x for m in group))): group
                       for group in xgrid.values()}
        self._ygrid = {int(round(statistics.fmean(m.p0.y for m in group))): group
                       for group in ygrid.values()}
        # Map the positions to integers
        self._xpos = list(sorted(self._xgrid))
        self._ypos = list(sorted(self._ygrid))

        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
        self._cells = None

    def _cell_borders(self, x: int, y: int, bbox: "Rectangle",
                      mask: int = 0b1111) -> "TableCell.Borders":
        """Return the border widths of grid cell ``(x, y)``.

        A side gets the width of the first grid line on that side that spans
        the midpoint of ``bbox``; sides not selected by ``mask`` (bit order
        left, bottom, right, top) or without such a line stay 0.
        """
        # left, bottom, right, top
        borders = [0, 0, 0, 0]
        mp = bbox.midpoint
        if mask & 0b1000:  # Left
            for line in self._xgrid[self._xpos[x]]:
                if line.p0.y < mp.y < line.p1.y:
                    borders[0] = line.width
                    assert line.width
                    break
        if mask & 0b0010:  # Right
            for line in self._xgrid[self._xpos[x + 1]]:
                if line.p0.y < mp.y < line.p1.y:
                    borders[2] = line.width
                    assert line.width
                    break
        if mask & 0b0100:  # Bottom
            for line in self._ygrid[self._ypos[y]]:
                if line.p0.x < mp.x < line.p1.x:
                    borders[1] = line.width
                    assert line.width
                    break
        if mask & 0b0001:  # Top
            for line in self._ygrid[self._ypos[y + 1]]:
                if line.p0.x < mp.x < line.p1.x:
                    borders[3] = line.width
                    assert line.width
                    break

        return TableCell.Borders(*borders)

    def _fix_borders(self, cells, x: int, y: int):
        """Close inconsistent open borders between cell ``(x, y)`` and its right/top neighbors."""
        c = cells[(x, y)].b
        right = cells[(x + 1, y)]
        top = cells[(x, y + 1)]
        # A missing neighbor is modelled by a throw-away Borders object
        r = right.b if right is not None else TableCell.Borders(0, 0, 1, 0)
        t = top.b if top is not None else TableCell.Borders(0, 1, 0, 0)

        # Open at the top into a span
        if (not c.t and c.r) and (not t.r or not t.l):
            c.t = 1
            t.b = 1
        # Open at the top and self is a span
        if (not c.t and not c.r) and (t.r and t.l):
            c.t = 1
            t.b = 1

        # Open to the right into a span
        if (not c.r and c.t) and (not r.t or not r.b):
            c.r = 1
            r.l = 1
        # Open to the right and self is a span
        if (not c.r and not c.t) and (r.t and r.b):
            c.r = 1
            r.l = 1

    @property
    def cells(self) -> "list[TableCell]":
        """Cells of the table sorted by their first position (computed once, then cached).

        NOTE(review): the nesting of the header detection was reconstructed
        from a listing without indentation; confirm against the repository.
        """
        if self._cells is None:
            # Either dimension being empty means there is nothing to build.
            # (A tuple comparison against (1, 1) would let e.g. (2, 0) through.)
            if min(self.grid) < 1:
                self._cells = []
                return self._cells

            # First determine the spans of cells by checking the borders
            cells = defaultdict(lambda: None)
            for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
                for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
                    bbox = Rectangle(x0, y0, x1, y1)
                    borders = self._cell_borders(xi, yi, bbox, 0b1111)
                    # Grid rows count from the bottom, cell positions from the top
                    cells[(xi, yi)] = TableCell(self, (self.grid[1] - 1 - yi, xi),
                                                bbox, borders, self._type == "register")

            # Fix table cell borders via consistency checks
            for yi in range(self.grid[1]):
                for xi in range(self.grid[0]):
                    self._fix_borders(cells, xi, yi)

            # Merge the cells recursively
            def _merge(px, py, x, y):
                if cells[(x, y)] is None:
                    return
                # Right border is open
                if not cells[(x, y)].b.r:
                    if cells[(x + 1, y)] is not None:
                        cells[(px, py)]._merge(cells[(x + 1, y)])
                        _merge(px, py, x + 1, y)
                        cells[(x + 1, y)] = None
                # Top border is open
                if not cells[(x, y)].b.t:
                    if cells[(x, y + 1)] is not None:
                        cells[(px, py)]._merge(cells[(x, y + 1)])
                        _merge(px, py, x, y + 1)
                        cells[(x, y + 1)] = None
            # Start merging in bottom left cell
            for yi in range(self.grid[1]):
                for xi in range(self.grid[0]):
                    _merge(xi, yi, xi, yi)

            # Find the header line, it is thicker than normal
            y_header_pos = self.grid[1]
            if self._type != "register":
                if self.grid[1] > 1:
                    line_widths = {round(line.width, 1) for llist in self._ygrid.values()
                                   for line in llist if line.width != 0.1}  # magic width of virtual borders
                    if line_widths:
                        line_width_max = max(line_widths) * 0.9
                        if min(line_widths) < line_width_max:
                            # Find the first thick line starting from the top
                            y_header_pos = next((yi for yi, ypos in reversed(list(enumerate(self._ypos)))
                                                 if any(line.width > line_width_max for line in self._ygrid[ypos])),
                                                y_header_pos)

            # Map all the header
            is_bold = []
            for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
                bbox = None
                for xi in range(self.grid[0]):
                    if (cell := cells[(xi, yi)]) is not None:
                        if bbox is None:
                            bbox = cell.bbox
                        else:
                            bbox = bbox.joined(cell.bbox)
                if bbox is None:
                    continue
                chars = self._page.chars_in_area(bbox)
                is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
                is_bold.append((yi, is_bold_pct > self._spacing["th"]))

            # Some tables have no bold cells at all
            if all(not b[1] for b in is_bold):
                # Special case for two row tables without bold headers, but a bold line inbetween
                if self.grid[1] == 2 and y_header_pos == 1:
                    y_header_pos = 2
            else:
                if y_header_pos < self.grid[1]:
                    # Find the lowest bold row starting from bold line
                    y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
                else:
                    # Find the lowest bold row starting from the top
                    for b in reversed(is_bold):
                        if not b[1]:
                            break
                        y_header_pos = b[0]

            # Tell the header cells
            for yi in range(y_header_pos, self.grid[1]):
                for xi in range(self.grid[0]):
                    if (cell := cells[(xi, yi)]) is not None:
                        cell.is_header = True

            # Flatten into array
            cells = [c for c in cells.values() if c is not None]

            # Normalize cells for registers by moving the lower ones right and up
            if self._type == "register" and self._bit_headers is not None:
                for cell in cells:
                    if cell.y >= self._bit_headers:
                        cell._move(16, -self._bit_headers)
                    elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
                        cell._expand(0, 3 - self._bit_headers)
                self.grid = (32, 4)

            self._cells = list(sorted(cells, key=lambda c: c.positions[0]))

        return self._cells

    def append_bottom(self, other, merge_headers=True) -> bool:
        """Append the non-header cells of ``other`` below this table.

        When ``merge_headers`` is set and the column counts differ, the column
        positions of both tables are first rewritten onto a shared layout
        derived from the column spans of their header cells.

        Returns False, before any cell is moved, when the two tables have a
        different number of header column groups; True otherwise.
        """
        debug = False
        xgrid = self.grid[0]
        if merge_headers and xgrid != other.grid[0]:
            # Some tables have different column layouts due to span cells
            # So we must correct the X positions of all cells accordingly
            self_xheaders = defaultdict(set)
            other_xheaders = defaultdict(set)
            self_headers = [c for c in self.cells if c.is_header]
            other_headers = [c for c in other.cells if c.is_header]
            # Find the smallest set of spanning xpositions based on the header cells
            for xpos in range(self.grid[0]):
                for hcell in self_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        self_xheaders[hcell.x].add(xpos)
            for xpos in range(other.grid[0]):
                for hcell in other_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        other_xheaders[hcell.x].add(xpos)

            # Compute the shared
            self_heads = sorted(self_xheaders.keys())
            other_heads = sorted(other_xheaders.keys())
            xgrid = 0
            merged_xheaders = defaultdict(set)
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(self_heads, other_heads):
                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
                xgrid += size

            if debug:
                print(len(self_xheaders), self_xheaders)
                print(len(other_xheaders), other_xheaders)
                print(len(merged_xheaders), merged_xheaders)
            # If they are not equal length the table layouts are not compatible at all!
            if len(self_heads) != len(other_heads):
                LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
                return False

            # We want to stuff/move the cell positions inplace, therefore we start
            # backwards moving the high numbers even higher, so that we don't
            # overwrite ourselves and get stuck in an infinite loop
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
                merged_xhead = max(self_xhead, other_xhead)
                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
                merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)

                def _insert_cells(cell, src, dsts, insert_only):
                    assert dsts
                    new_positions = []
                    any_change = False
                    for cpos in reversed(cell.positions):
                        if insert_only:
                            # If our set is empty we must only insert positions
                            if cpos[1] == src:
                                for xpos in dsts:
                                    if debug:
                                        print(f"Insert {cpos}++{(cpos[0], xpos)}")
                                    new_positions.append((cpos[0], xpos))
                                any_change = True
                            new_positions.append(cpos)
                        else:
                            # We must move (=replace and add) the span positions
                            if cpos[1] == src:
                                if debug:
                                    print(f"Move {cpos}->{(cpos[0], dsts[0])}")
                                new_positions.append((cpos[0], dsts[0]))
                                any_change = True
                            else:
                                new_positions.append(cpos)
                    if debug and any_change:
                        print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
                        print("old=", cell.positions, "new=", sorted(new_positions))
                        print()
                    assert new_positions
                    assert len(new_positions) == len(set(new_positions))
                    cell.positions = sorted(new_positions)

                def _move_cells(cells, own_xpos):
                    if debug:
                        print()
                        print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
                        print()

                    for ii in range(max(len(own_xpos), len(merged_xpos))):
                        insert_only = ii >= len(own_xpos)
                        if insert_only:
                            src = merged_xpos[ii - 1]
                            dsts = merged_xpos[ii:]
                            if debug: print(f"{src}->{dsts} I")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, True)
                            break
                        else:
                            src = own_xpos[ii]
                            dsts = merged_xpos[ii:ii + 1]
                            if debug: print(f"{src}->{dsts} M")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, False)

                if debug: print()
                if self_xpos != merged_xpos:
                    if debug:
                        print(f"====== Self: x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
                    _move_cells(self.cells, self_xpos)
                if other_xpos != merged_xpos:
                    if debug:
                        print(f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}")
                    _move_cells(other.cells, other_xpos)
            if debug:
                print()
                print()
                print()

        # We must move the cells downwards now, but minus the header rows
        rows = self.grid[1] - other.header_rows
        for cell in other.cells:
            # Discard the header cells, we just assume they are the same
            if not cell.is_header:
                cell._move(0, rows)
                self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (xgrid, other.grid[1] + rows)
        if debug:
            print(f"{self._page} -> {self.grid}")
        return True

    def append_side(self, other, expand=False) -> bool:
        """Append all cells of ``other`` to the right of this table.

        If the row counts differ, the shorter table's bottom cells are
        expanded downwards when ``expand`` is set; otherwise an error is
        logged and False is returned without moving any cell.
        """
        if self.grid[1] != other.grid[1]:
            if expand:
                LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
                ymin = min(self.grid[1], other.grid[1])
                ymax = max(self.grid[1], other.grid[1])
                etable = other if self.grid[1] > other.grid[1] else self
                for cell in etable.cells:
                    if any(p[0] == ymin - 1 for p in cell.positions):
                        cell._expand(0, ymax - ymin)
                etable.grid = (etable.grid[0], ymax)
            else:
                LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
                return False

        # We must move all cells to the right now
        columns = self.grid[0]
        for cell in other.cells:
            cell._move(columns, 0)
            self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
        return True

    @cached_property
    def header_rows(self) -> int:
        """Number of rows occupied by header cells, 0 if there are none (cached)."""
        header_cells = [c for c in self.cells if c.is_header]
        if header_cells:
            return max(c.positions[-1][0] + 1 for c in header_cells)
        return 0

    def __repr__(self) -> str:
        return f"Table({self.grid[0]}x{self.grid[1]})"
Table( page, bbox: modm_data.utils.math.Rectangle, xlines: list, ylines: list, cbbox: modm_data.utils.math.Rectangle = None, is_register: bool = False)
def __init__(self, page, bbox: Rectangle, xlines: list, ylines: list,
             cbbox: Rectangle = None, is_register: bool = False):
    """
    Detect the grid of a table from the lines found inside its bounding box.

    The lines are clustered by position into `self._xgrid`/`self._ygrid`
    (position -> list of lines), the sorted positions are stored in
    `self._xpos`/`self._ypos` and the grid size `(columns, rows)` in
    `self.grid`. The cells themselves are computed lazily, `self._cells`
    is only initialized to `None` here.

    NOTE(review): this block was re-indented from a listing whose whitespace
    had been flattened. The nesting of the `break` and of the "close the top"
    statement was inferred from context, confirm against the upstream source.

    :param page: the page containing the table. Its `_spacing` mapping and its
                 `_charlines_filtered()` method are used here.
    :param bbox: the bounding box of the whole table.
    :param xlines: lines that are clustered by their `p0.x` coordinate.
    :param ylines: lines that are clustered by their `p0.y` coordinate.
    :param cbbox: stored as `self.cbbox` for normal tables. For register
                  tables it is the area searched for the top row of numbers.
    :param is_register: if true, the table type becomes `"register"` and the
                        rows of bit numbers are used to add missing grid lines.
    """
    self._page = page
    self._spacing = page._spacing
    self.bbox = bbox
    # Register tables consume cbbox below and do not keep it
    self.cbbox = None if is_register else cbbox
    self._type = "table"
    self._bit_headers = None

    # Coalesce the vertical lines to detect the grid
    def _cluster(lines, key):
        # Lines are visited in ascending key order. A new group is started
        # whenever a line lies more than `atol` past the previous line, the
        # group is keyed by the position of its first line.
        atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
        grid = defaultdict(list)
        last = -1e9
        current = -1e9
        for line in sorted(lines, key=key):
            if (last + atol) < key(line):
                current = key(line)
            grid[current].append(line)
            last = key(line)
        return grid
    xgrid = _cluster(xlines, lambda l: l.p0.x)
    ygrid = _cluster(ylines, lambda l: l.p0.y)

    if is_register:
        self._type = "register"

        # Find the positions of the top numbers
        clusters = []
        if lines := self._page._charlines_filtered(cbbox):
            # Only the first character line is considered
            if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
                clusters.append((cluster, cbbox))
            else:
                # The grid must be set before logging, since repr(self) reads it
                self.grid = (0, 0)
                LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")

        # Find the positions of the second row of numbers
        if len(ygrid) > 2:
            # Walk over every pair of neighbouring horizontal grid positions
            for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
                nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y,
                                  self.bbox.right, ygrid[ypos1][0].p0.y)
                if lines := self._page._charlines_filtered(nbbox):
                    # The row must consist only of numbers, spaces and line breaks
                    if all(c.char.isnumeric() or c.unicode in {0x20, 0xa, 0xd} for c in lines[0].chars):
                        # The number of clusters must be a multiple of 16
                        if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
                            clusters.append((cluster, nbbox))
                            self._bit_headers = len(ygrid) - yi - 1
                        else:
                            # The grid must be set before logging, since repr(self) reads it
                            self.grid = (len(cluster), 0)
                            LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
                        # Only the first purely numeric row is evaluated
                        break

        # Merge these clusters to find their positions
        for cluster, bbox in clusters:
            # Close left and right side
            xgrid[sorted(xgrid)[0]].append(VLine(self.bbox.left, bbox.bottom, bbox.top))
            xgrid[sorted(xgrid)[-1]].append(VLine(self.bbox.right, bbox.bottom, bbox.top))
            # Now close the lines in between
            for cleft, cright in zip(cluster, cluster[1:]):
                # find a line between the clusters
                xpos = next(((x, xgrid[x][0].p0.x) for x in sorted(xgrid)
                             if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left), None)
                # Didn't find one, we must add one manually
                if xpos is None:
                    # Place it halfway between the two clusters
                    xpos = (cleft.bbox.right + cright.bbox.left) / 2
                    xpos = (int(round(xpos)), xpos)
                # Add it to the grid
                xgrid[xpos[0]].append(VLine(xpos[1], bbox.bottom, bbox.top))
        # close the top
        ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))

    # Fix the position keys properly
    # Every group is re-keyed by the rounded mean position of its lines
    self._xgrid = {int(round(statistics.fmean(m.p0.x for m in l))): l
                   for l in xgrid.values()}
    self._ygrid = {int(round(statistics.fmean(m.p0.y for m in l))): l
                   for l in ygrid.values()}
    # Map the positions to integers
    self._xpos = list(sorted(self._xgrid))
    self._ypos = list(sorted(self._ygrid))

    # N grid positions enclose N - 1 columns or rows
    self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
    self._cells = None
cells: list[TableCell]
@property
def cells(self) -> list[TableCell]:
    """
    The merged cells of this table, sorted by their first position.

    The list is computed on first access and cached in `self._cells`:

    1. One `TableCell` is created per grid field from `self._xpos` and
       `self._ypos`, with its borders coming from `self._cell_borders()`.
    2. The borders are post-processed by `self._fix_borders()`.
    3. Cells with an open right or top border are merged with their neighbour.
    4. For non-register tables the header rows are searched via a thicker
       horizontal line and via rows with mostly bold characters, and the
       cells in these rows get `is_header = True`.
    5. For register tables with detected bit headers the lower cells are
       moved right and up and the grid is set to `(32, 4)`.

    NOTE(review): this block was re-indented from a listing whose whitespace
    had been flattened. The nesting of the bold-row detection inside the
    non-register branch and the placement of `self.grid = (32, 4)` after the
    loop were inferred from context, confirm against the upstream source.

    :return: the cached list of cells, empty if the grid has no fields.
    """
    if self._cells is None:
        # Both dimensions need at least one field. Comparing the tuple as a
        # whole (`self.grid < (1, 1)`) is lexicographic and lets grids such as
        # (2, 0) slip through, so the dimensions are checked individually.
        if self.grid[0] < 1 or self.grid[1] < 1:
            self._cells = []
            return self._cells

        # First determine the spans of cells by checking the borders
        # Missing keys yield None, so lookups beyond the grid are harmless
        cells = defaultdict(lambda: None)
        for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
            for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
                bbox = Rectangle(x0, y0, x1, y1)
                borders = self._cell_borders(xi, yi, bbox, 0b1111)
                # The row of the cell position counts opposite to the grid index yi
                cells[(xi, yi)] = TableCell(self, (self.grid[1] - 1 - yi, xi),
                                            bbox, borders, self._type == "register")

        # Fix table cell borders via consistency checks
        for yi in range(self.grid[1]):
            for xi in range(self.grid[0]):
                self._fix_borders(cells, xi, yi)

        # Merge the cells recursively: (px, py) is the cell that absorbs its
        # neighbours, (x, y) is the cell whose borders are being followed
        def _merge(px, py, x, y):
            if cells[(x, y)] is None:
                return
            # Right border is open
            if not cells[(x, y)].b.r:
                if cells[(x + 1, y)] is not None:
                    cells[(px, py)]._merge(cells[(x + 1, y)])
                    _merge(px, py, x + 1, y)
                    # The neighbour is now part of (px, py)
                    cells[(x + 1, y)] = None
            # Top border is open
            if not cells[(x, y)].b.t:
                if cells[(x, y + 1)] is not None:
                    cells[(px, py)]._merge(cells[(x, y + 1)])
                    _merge(px, py, x, y + 1)
                    # The neighbour is now part of (px, py)
                    cells[(x, y + 1)] = None
        # Start merging in bottom left cell
        for yi in range(self.grid[1]):
            for xi in range(self.grid[0]):
                _merge(xi, yi, xi, yi)

        # Find the header line, it is thicker than normal
        # y_header_pos == self.grid[1] means that there are no header rows
        y_header_pos = self.grid[1]
        if self._type != "register":
            if self.grid[1] > 1:
                line_widths = {round(line.width, 1) for llist in self._ygrid.values()
                               for line in llist if line.width != 0.1}  # magic width of virtual borders
                if line_widths:
                    line_width_max = max(line_widths) * 0.9
                    # Only useful if at least one line is thinner than the thickest
                    if min(line_widths) < line_width_max:
                        # Find the first thick line starting from the top
                        y_header_pos = next((yi for yi, ypos in reversed(list(enumerate(self._ypos)))
                                             if any(line.width > line_width_max for line in self._ygrid[ypos])),
                                            y_header_pos)

            # Map all the header rows to whether they are mostly bold
            is_bold = []
            for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
                # Join the bounding boxes of all cells starting in this row
                bbox = None
                for xi in range(self.grid[0]):
                    if (cell := cells[(xi, yi)]) is not None:
                        if bbox is None:
                            bbox = cell.bbox
                        else:
                            bbox = bbox.joined(cell.bbox)
                if bbox is None:
                    continue
                chars = self._page.chars_in_area(bbox)
                # A row without any characters counts as fully bold
                is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
                is_bold.append((yi, is_bold_pct > self._spacing["th"]))

            # Some tables have no bold cells at all
            if all(not b[1] for b in is_bold):
                # Special case for two row tables without bold headers, but a bold line in between
                if self.grid[1] == 2 and y_header_pos == 1:
                    y_header_pos = 2
            else:
                if y_header_pos < self.grid[1]:
                    # Find the lowest bold row starting from bold line
                    y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
                else:
                    # Find the lowest bold row starting from the top
                    for b in reversed(is_bold):
                        if not b[1]:
                            break
                        y_header_pos = b[0]

        # Tell the header cells
        for yi in range(y_header_pos, self.grid[1]):
            for xi in range(self.grid[0]):
                if (cell := cells[(xi, yi)]) is not None:
                    cell.is_header = True

        # Flatten into array, the merged-away fields are dropped
        cells = [c for c in cells.values() if c is not None]

        # Normalize cells for registers by moving the lower ones right and up
        if self._type == "register" and self._bit_headers is not None:
            for cell in cells:
                if cell.y >= self._bit_headers:
                    cell._move(16, -self._bit_headers)
                elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
                    cell._expand(0, 3 - self._bit_headers)
            self.grid = (32, 4)

        self._cells = sorted(cells, key=lambda c: c.positions[0])

    return self._cells
def
append_bottom(self, other, merge_headers=True) -> bool:
def append_bottom(self, other, merge_headers=True) -> bool:
    """
    Append the non-header cells of `other` below the cells of this table.

    If `merge_headers` is set and both tables have a different number of
    columns, the columns are aligned first: the header cells of each table
    are grouped by their starting column, the groups of both tables are
    paired in ascending order and each pair is widened to the larger of the
    two spans. The cell positions of both tables are rewritten in place.

    Afterwards the non-header cells of `other` are moved below this table
    and added to `self.cells`, so these cell objects are shared by both
    tables. `self.grid` is updated accordingly.

    NOTE(review): if the column counts differ and neither table has any
    header cells, both group lists are empty, `xgrid` remains 0 and the
    resulting grid width is 0. Confirm whether this case can occur.

    NOTE(review): this block was re-indented from a listing whose whitespace
    had been flattened, the nesting was inferred from context.

    :param other: the table whose cells are appended.
    :param merge_headers: align differing column layouts via the header cells.
    :return: `False` if the number of header groups differs, in which case an
             error is logged and nothing is appended, otherwise `True`.
    """
    # Enables the print statements below for manual debugging
    debug = False
    xgrid = self.grid[0]
    if merge_headers and xgrid != other.grid[0]:
        # Some tables have different column layouts due to span cells
        # So we must correct the X positions of all cells accordingly
        # Both map the starting column of a header cell to all columns it covers
        self_xheaders = defaultdict(set)
        other_xheaders = defaultdict(set)
        self_headers = [c for c in self.cells if c.is_header]
        other_headers = [c for c in other.cells if c.is_header]
        # Find the smallest set of spanning xpositions based on the header cells
        for xpos in range(self.grid[0]):
            for hcell in self_headers:
                if any(p[1] == xpos for p in hcell.positions):
                    self_xheaders[hcell.x].add(xpos)
        for xpos in range(other.grid[0]):
            for hcell in other_headers:
                if any(p[1] == xpos for p in hcell.positions):
                    other_xheaders[hcell.x].add(xpos)

        # Compute the shared column layout
        self_heads = sorted(self_xheaders.keys())
        other_heads = sorted(other_xheaders.keys())
        # xgrid is recomputed as the sum of all merged group widths
        xgrid = 0
        merged_xheaders = defaultdict(set)
        # Zip the groups together, these represent the matching header group spans
        for self_xhead, other_xhead in zip(self_heads, other_heads):
            # Each merged group is as wide as the wider one of the pair
            size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
            merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
            xgrid += size

        if debug:
            print(len(self_xheaders), self_xheaders)
            print(len(other_xheaders), other_xheaders)
            print(len(merged_xheaders), merged_xheaders)
        # If they are not equal length the table layouts are not compatible at all!
        if len(self_heads) != len(other_heads):
            LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
            return False

        # We want to stuff/move the cell positions inplace, therefore we start
        # backwards moving the high numbers even higher, so that we don't
        # overwrite ourselves and get stuck in an infinite loop
        # Zip the groups together, these represent the matching header group spans
        for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
            merged_xhead = max(self_xhead, other_xhead)
            # All column lists are in descending order
            self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
            other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
            merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)

            # Rewrite the positions of one cell: either move column `src` to
            # `dsts[0]`, or keep `src` and additionally insert all of `dsts`
            def _insert_cells(cell, src, dsts, insert_only):
                assert dsts
                new_positions = []
                any_change = False
                for cpos in reversed(cell.positions):
                    if insert_only:
                        # If our set is empty we must only insert positions
                        if cpos[1] == src:
                            for xpos in dsts:
                                if debug:
                                    print(f"Insert {cpos}++{(cpos[0], xpos)}")
                                new_positions.append((cpos[0], xpos))
                                any_change = True
                        # The original position is always kept
                        new_positions.append(cpos)
                    else:
                        # We must move (=replace and add) the span positions
                        if cpos[1] == src:
                            if debug:
                                print(f"Move {cpos}->{(cpos[0], dsts[0])}")
                            new_positions.append((cpos[0], dsts[0]))
                            any_change = True
                        else:
                            new_positions.append(cpos)
                if debug and any_change:
                    print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
                    print("old=", cell.positions, "new=", sorted(new_positions))
                    print()
                assert new_positions
                # The rewrite must not make two positions collide
                assert len(new_positions) == len(set(new_positions))
                cell.positions = sorted(new_positions)

            # Rewrite the columns `own_xpos` of all `cells` to `merged_xpos`
            def _move_cells(cells, own_xpos):
                if debug:
                    print()
                    print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
                    print()

                for ii in range(max(len(own_xpos), len(merged_xpos))):
                    # All own columns are moved, the remaining ones are new
                    insert_only = ii >= len(own_xpos)
                    if insert_only:
                        # Cells in the last moved column also get all remaining columns
                        src = merged_xpos[ii - 1]
                        dsts = merged_xpos[ii:]
                        if debug: print(f"{src}->{dsts} I")
                        for cell in cells:
                            _insert_cells(cell, src, dsts, True)
                        # All remaining columns were inserted at once
                        break
                    else:
                        src = own_xpos[ii]
                        dsts = merged_xpos[ii:ii + 1]
                        if debug: print(f"{src}->{dsts} M")
                        for cell in cells:
                            _insert_cells(cell, src, dsts, False)

            if debug: print()
            # Only rewrite the table whose columns differ from the merged ones
            if self_xpos != merged_xpos:
                if debug:
                    print(f"====== Self: x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
                _move_cells(self.cells, self_xpos)
            if other_xpos != merged_xpos:
                if debug:
                    print(f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}")
                _move_cells(other.cells, other_xpos)
            if debug:
                print()
                print()
                print()

    # We must move the cells downwards now, but minus the header rows
    rows = self.grid[1] - other.header_rows
    for cell in other.cells:
        # Discard the header cells, we just assume they are the same
        if not cell.is_header:
            cell._move(0, rows)
            self.cells.append(cell)
    self.cells.sort(key=lambda c: c.positions[0])
    self.grid = (xgrid, other.grid[1] + rows)
    if debug:
        print(f"{self._page} -> {self.grid}")
    return True
def
append_side(self, other, expand=False) -> bool:
def append_side(self, other, expand=False) -> bool:
    """
    Append all cells of `other` to the right-hand side of this table.

    Both tables must have the same number of rows. If they do not and
    `expand` is set, the cells touching the last row of the shorter table
    are expanded downwards so that both heights match.

    The cells of `other` are moved in place and added to `self.cells`,
    and `self.grid` is widened by the columns of `other`.

    :param other: the table whose cells are appended.
    :param expand: equalize differing heights instead of failing.
    :return: `False` if the heights differ and `expand` is not set, in which
             case an error is logged and nothing changes, otherwise `True`.
    """
    own_rows = self.grid[1]
    other_rows = other.grid[1]
    if own_rows != other_rows:
        # Without permission to expand the tables are incompatible
        if not expand:
            LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
            return False

        LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
        shorter = min(own_rows, other_rows)
        taller = max(own_rows, other_rows)
        # The table with fewer rows is the one that has to grow
        etable = other if own_rows > other_rows else self
        last_row = shorter - 1
        for cell in etable.cells:
            # Only cells reaching into the last row are stretched
            if any(pos[0] == last_row for pos in cell.positions):
                cell._expand(0, taller - shorter)
        etable.grid = (etable.grid[0], taller)

    # Shift the other cells right by the width of this table
    x_offset = self.grid[0]
    for cell in other.cells:
        cell._move(x_offset, 0)
        self.cells.append(cell)
    self.cells.sort(key=lambda c: c.positions[0])
    self.grid = (x_offset + other.grid[0], max(self.grid[1], other.grid[1]))
    return True
class VirtualTable(Table):
    """
    A table built from a list of already existing cells.

    `Table.__init__()` is not called, so no grid is detected from lines:
    the given cells are used as they are and the grid size is derived from
    their top-left positions.
    """

    def __init__(self, page, bbox, cells, table_type=None):
        """
        :param page: the page of the table, its `_spacing` is stored as well.
        :param bbox: the bounding box of the table.
        :param cells: the cells of the table. Their `_table` attribute is set
                      to this table. May be empty.
        :param table_type: the type of the table, `"virtual"` if not given.
        """
        self._page = page
        self._spacing = page._spacing
        self._type = table_type or "virtual"
        self.bbox = bbox
        self._cells = cells
        # Without any cells `max()` would raise a ValueError, the default of
        # -1 results in an empty (0, 0) grid instead
        self.grid = (max((c.x for c in cells), default=-1) + 1,
                     max((c.y for c in cells), default=-1) + 1)
        # The cells now belong to this table
        for cell in cells:
            cell._table = self

    def __repr__(self) -> str:
        return f"VTable({self.grid[0]}x{self.grid[1]})"
VirtualTable(page, bbox, cells, table_type=None)
def __init__(self, page, bbox, cells, table_type=None):
    """
    Create a table from a list of already existing cells.

    The grid size is derived from the top-left positions of the cells.

    :param page: the page of the table, its `_spacing` is stored as well.
    :param bbox: the bounding box of the table.
    :param cells: the cells of the table. Their `_table` attribute is set
                  to this table. May be empty.
    :param table_type: the type of the table, `"virtual"` if not given.
    """
    self._page = page
    self._spacing = page._spacing
    self._type = table_type or "virtual"
    self.bbox = bbox
    self._cells = cells
    # Without any cells `max()` would raise a ValueError, the default of
    # -1 results in an empty (0, 0) grid instead
    self.grid = (max((c.x for c in cells), default=-1) + 1,
                 max((c.y for c in cells), default=-1) + 1)
    # The cells now belong to this table
    for cell in cells:
        cell._table = self