modm_data.pdf2html.table
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

import logging
import statistics
from functools import cached_property
from collections import defaultdict
from ..utils import HLine, VLine, Rectangle
from .cell import Cell, Borders

_LOGGER = logging.getLogger(__name__)


class Table:
    """Table whose cell grid is derived from clustered border lines.

    The constructor clusters the vertical and horizontal lines into grid
    positions; the cells themselves are built lazily by the `cells` property.
    """

    def __init__(
        self, page, bbox: Rectangle, xlines: list, ylines: list, cbbox: Rectangle = None, is_register: bool = False
    ):
        """Cluster the border lines into the x/y grid positions of the table.

        Args:
            page: Page object; `_spacing`, `charlines_in_area()` and
                `chars_in_area()` are used.
            bbox: Bounding box of the table.
            xlines: Vertical lines, clustered by `p0.x`.
            ylines: Horizontal lines, clustered by `p0.y`.
            cbbox: For register tables the area searched for the top row of
                bit numbers; otherwise only stored as `self.cbbox`.
            is_register: Detect bit position columns and add virtual lines.
        """
        self._page = page
        self._spacing = page._spacing
        self.bbox = bbox
        self.cbbox = None if is_register else cbbox
        self._type = "table"
        self._bit_headers = None

        def _cluster(lines, key):
            # A line joins the current cluster when it is within `atol` of the
            # previous (sorted) line; clusters are keyed by their first line.
            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
            grid = defaultdict(list)
            last = -1e9
            current = -1e9
            for line in sorted(lines, key=key):
                pos = key(line)
                if (last + atol) < pos:
                    current = pos
                grid[current].append(line)
                last = pos
            return grid

        def _rekey(grid, key):
            # Key every cluster by its rounded mean position. Clusters that
            # round to the same integer are merged instead of overwriting
            # each other, so no line is silently dropped.
            rekeyed = {}
            for cluster in grid.values():
                pos = int(round(statistics.fmean(key(m) for m in cluster)))
                rekeyed.setdefault(pos, []).extend(cluster)
            return rekeyed

        xgrid = _cluster(xlines, lambda line: line.p0.x)
        ygrid = _cluster(ylines, lambda line: line.p0.y)

        if is_register:
            self._type = "register"
            x_tol = self._page._spacing["x_em"] / 2

            # Find the positions of the top numbers
            clusters = []
            if lines := self._page.charlines_in_area(cbbox):
                if len(cluster := lines[0].clusters(x_tol)):
                    clusters.append((cluster, cbbox))
                else:
                    # __repr__ reads self.grid, so set it before logging
                    self.grid = (0, 0)
                    _LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")

            # Find the positions of the second row of numbers
            if len(ygrid) > 2:
                ykeys = sorted(ygrid)
                for yi, (ypos0, ypos1) in enumerate(zip(ykeys, ykeys[1:])):
                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y, self.bbox.right, ygrid[ypos1][0].p0.y)
                    lines = self._page.charlines_in_area(nbbox)
                    if not lines:
                        continue
                    if not all(c.char.isnumeric() or c.unicode in {0x20, 0xA, 0xD} for c in lines[0].chars):
                        continue
                    # Only the first purely numeric row is considered
                    cluster = lines[0].clusters(x_tol)
                    if not len(cluster) % 16:
                        clusters.append((cluster, nbbox))
                        self._bit_headers = len(ygrid) - yi - 1
                    else:
                        # __repr__ reads self.grid, so set it before logging
                        self.grid = (len(cluster), 0)
                        _LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
                    break

            # Merge these clusters to find their positions
            for cluster, cluster_bbox in clusters:
                # Close left and right side. Without any vertical line the
                # table edges themselves become the outermost clusters.
                left_key = min(xgrid, default=self.bbox.left)
                right_key = max(xgrid, default=self.bbox.right)
                xgrid[left_key].append(VLine(self.bbox.left, cluster_bbox.bottom, cluster_bbox.top))
                xgrid[right_key].append(VLine(self.bbox.right, cluster_bbox.bottom, cluster_bbox.top))
                # Now close the lines in between
                for cleft, cright in zip(cluster, cluster[1:]):
                    # find a line between the clusters
                    xpos = next(
                        (
                            (x, xgrid[x][0].p0.x)
                            for x in sorted(xgrid)
                            if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left
                        ),
                        None,
                    )
                    # Didn't find one, we must add one manually
                    if xpos is None:
                        xmid = (cleft.bbox.right + cright.bbox.left) / 2
                        xpos = (int(round(xmid)), xmid)
                    # Add it to the grid
                    xgrid[xpos[0]].append(VLine(xpos[1], cluster_bbox.bottom, cluster_bbox.top))
            # close the top
            # NOTE(review): nesting reconstructed from a listing without
            # indentation; assumed to apply to register tables only - confirm.
            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))

        # Fix the position keys properly
        self._xgrid = _rekey(xgrid, lambda line: line.p0.x)
        self._ygrid = _rekey(ygrid, lambda line: line.p0.y)
        # Map the positions to integers
        self._xpos = sorted(self._xgrid)
        self._ypos = sorted(self._ygrid)

        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
        self._cells = None

    def _cell_borders(self, x: int, y: int, bbox: Rectangle, mask: int = 0b1111) -> Borders:
        """Return the border widths of grid cell (x, y).

        A border is the width of the first line in the respective grid cluster
        that strictly spans the midpoint of `bbox`, or 0 if there is none.
        `mask` selects the sides to check: 0b1000 left, 0b0100 bottom,
        0b0010 right, 0b0001 top.
        """
        mp = bbox.midpoint

        def _width(lines, axis, mid):
            for line in lines:
                if getattr(line.p0, axis) < mid < getattr(line.p1, axis):
                    assert line.width
                    return line.width
            return 0

        left = _width(self._xgrid[self._xpos[x]], "y", mp.y) if mask & 0b1000 else 0
        right = _width(self._xgrid[self._xpos[x + 1]], "y", mp.y) if mask & 0b0010 else 0
        bottom = _width(self._ygrid[self._ypos[y]], "x", mp.x) if mask & 0b0100 else 0
        top = _width(self._ygrid[self._ypos[y + 1]], "x", mp.x) if mask & 0b0001 else 0
        # Borders takes: left, bottom, right, top
        return Borders(left, bottom, right, top)

    def _fix_borders(self, cells, x: int, y: int):
        """Close inconsistent open borders between cell (x, y) and its right and top neighbors."""
        c = cells[(x, y)].borders
        right_cell = cells[(x + 1, y)]
        top_cell = cells[(x, y + 1)]
        # Missing neighbors are replaced by throwaway placeholder borders
        r = right_cell.borders if right_cell is not None else Borders(0, 0, 1, 0)
        t = top_cell.borders if top_cell is not None else Borders(0, 1, 0, 0)

        if not c.top:
            t_closed = t.right and t.left
            # Open at the top into a span, or open at the top and self is a span
            if (c.right and not t_closed) or (not c.right and t_closed):
                c.top = 1
                t.bottom = 1

        # This check deliberately sees the possibly just closed top border
        if not c.right:
            r_closed = r.top and r.bottom
            # Open to the right into a span, or open to the right and self is a span
            if (c.top and not r_closed) or (not c.top and r_closed):
                c.right = 1
                r.left = 1

    def _header_row(self, cells) -> int:
        """Return the lowest header row index, or `self.grid[1]` if no header is found."""
        rows = self.grid[1]
        y_header_pos = rows
        # NOTE(review): nesting reconstructed from a listing without
        # indentation; header detection is assumed to be skipped entirely for
        # register tables - confirm.
        if self._type == "register":
            return y_header_pos

        # Find the header line, it is thicker than normal
        if rows > 1:
            line_widths = {
                round(line.width, 1) for llist in self._ygrid.values() for line in llist if line.width != 0.1
            }  # magic width of virtual borders
            if line_widths:
                line_width_max = max(line_widths) * 0.9
                if min(line_widths) < line_width_max:
                    # Find the first thick line starting from the top
                    y_header_pos = next(
                        (
                            yi
                            for yi, ypos in reversed(list(enumerate(self._ypos)))
                            if any(line.width > line_width_max for line in self._ygrid[ypos])
                        ),
                        y_header_pos,
                    )

        # Determine per row whether its characters are mostly bold
        is_bold = []
        for yi in range(0 if y_header_pos == rows else y_header_pos, rows):
            bbox = None
            for xi in range(self.grid[0]):
                if (cell := cells[(xi, yi)]) is not None:
                    bbox = cell.bbox if bbox is None else bbox.joined(cell.bbox)
            if bbox is None:
                continue
            chars = self._page.chars_in_area(bbox)
            # A row without characters counts as fully bold
            is_bold_pct = sum("Bold" in c.font for c in chars) / len(chars) if chars else 1
            is_bold.append((yi, is_bold_pct > self._spacing["th"]))

        if not any(bold for _, bold in is_bold):
            # Some tables have no bold cells at all
            # Special case for two row tables without bold headers, but a bold line inbetween
            if rows == 2 and y_header_pos == 1:
                y_header_pos = 2
        elif y_header_pos < rows:
            # Find the lowest bold row starting from bold line
            y_header_pos = next((yi for yi, bold in is_bold if y_header_pos <= yi and bold), y_header_pos)
        else:
            # Find the lowest bold row starting from the top
            for yi, bold in reversed(is_bold):
                if not bold:
                    break
                y_header_pos = yi
        return y_header_pos

    @property
    def cells(self) -> list[Cell]:
        """Cells of the table, built on first access and cached in `self._cells`.

        Grid cells are created from the grid positions, their borders are made
        consistent, cells with open borders are merged into spans, header cells
        are flagged and register cells are normalized.
        """
        if self._cells is not None:
            return self._cells
        # Compare both dimensions explicitly: tuple comparison is
        # lexicographic and would accept e.g. a grid of (2, 0).
        if self.grid[0] < 1 or self.grid[1] < 1:
            self._cells = []
            return self._cells

        columns, rows = self.grid
        is_register = self._type == "register"

        # First determine the spans of cells by checking the borders
        cells = defaultdict(lambda: None)
        for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
            for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
                bbox = Rectangle(x0, y0, x1, y1)
                borders = self._cell_borders(xi, yi, bbox, 0b1111)
                cells[(xi, yi)] = Cell(self, (rows - 1 - yi, xi), bbox, borders, is_register)

        # Fix table cell borders via consistency checks
        for yi in range(rows):
            for xi in range(columns):
                self._fix_borders(cells, xi, yi)

        # Merge the cells recursively into the cell at (px, py)
        def _merge(px, py, x, y):
            if cells[(x, y)] is None:
                return
            # Right border is open
            if not cells[(x, y)].borders.right:
                if cells[(x + 1, y)] is not None:
                    cells[(px, py)]._merge(cells[(x + 1, y)])
                    _merge(px, py, x + 1, y)
                    cells[(x + 1, y)] = None
            # Top border is open
            if not cells[(x, y)].borders.top:
                if cells[(x, y + 1)] is not None:
                    cells[(px, py)]._merge(cells[(x, y + 1)])
                    _merge(px, py, x, y + 1)
                    cells[(x, y + 1)] = None

        # Start merging in bottom left cell
        for yi in range(rows):
            for xi in range(columns):
                _merge(xi, yi, xi, yi)

        # Tell the header cells
        for yi in range(self._header_row(cells), rows):
            for xi in range(columns):
                if (cell := cells[(xi, yi)]) is not None:
                    cell.is_header = True

        # Flatten into array
        cells = [c for c in cells.values() if c is not None]

        # Normalize cells for registers by moving the lower ones right and up
        if is_register and self._bit_headers is not None:
            for cell in cells:
                if cell.y >= self._bit_headers:
                    cell._move(16, -self._bit_headers)
                elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
                    cell._expand(0, 3 - self._bit_headers)
            self.grid = (32, 4)

        self._cells = sorted(cells, key=lambda c: c.positions[0])
        return self._cells

    def append_bottom(self, other, merge_headers=True) -> bool:
        """Append the non-header cells of `other` below this table.

        With `merge_headers` and differing column counts, the columns of both
        tables are first rewritten in place onto a shared layout derived from
        their header cells.

        Returns:
            False if the header layouts have a different number of column
            groups (nothing is appended), otherwise True.
        """
        xgrid = self.grid[0]
        if merge_headers and xgrid != other.grid[0]:
            # Some tables have different column layouts due to span cells
            # So we must correct the X positions of all cells accordingly

            def _header_spans(table):
                # Map each header cell's x to the set of columns it occupies
                spans = defaultdict(set)
                headers = [c for c in table.cells if c.is_header]
                for xpos in range(table.grid[0]):
                    for hcell in headers:
                        if any(p[1] == xpos for p in hcell.positions):
                            spans[hcell.x].add(xpos)
                return spans

            def _insert_cells(cell, src, dsts, insert_only):
                assert dsts
                new_positions = []
                for cpos in reversed(cell.positions):
                    if cpos[1] != src:
                        new_positions.append(cpos)
                    elif insert_only:
                        # Keep the position and add all destination columns
                        new_positions.extend((cpos[0], xpos) for xpos in dsts)
                        new_positions.append(cpos)
                    else:
                        # Move the position to the destination column
                        new_positions.append((cpos[0], dsts[0]))
                assert new_positions
                assert len(new_positions) == len(set(new_positions))
                cell.positions = sorted(new_positions)
                cell._invalidate()

            def _move_cells(cells, own_xpos, merged_xpos):
                _LOGGER.debug("Rewrite columns: %s->%s", own_xpos, merged_xpos)
                for ii in range(max(len(own_xpos), len(merged_xpos))):
                    if ii >= len(own_xpos):
                        # Own columns are used up: spread the last moved
                        # column over all remaining merged columns
                        src = merged_xpos[ii - 1]
                        dsts = merged_xpos[ii:]
                        for cell in cells:
                            _insert_cells(cell, src, dsts, True)
                        break
                    src = own_xpos[ii]
                    dsts = merged_xpos[ii : ii + 1]
                    for cell in cells:
                        _insert_cells(cell, src, dsts, False)

            # Find the smallest set of spanning xpositions based on the header cells
            self_xheaders = _header_spans(self)
            other_xheaders = _header_spans(other)
            self_heads = sorted(self_xheaders)
            other_heads = sorted(other_xheaders)
            # If they are not equal length the table layouts are not compatible at all!
            if len(self_heads) != len(other_heads):
                _LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
                return False

            # Zip the groups together, these represent the matching header group spans
            xgrid = 0
            merged_xheaders = {}
            for self_xhead, other_xhead in zip(self_heads, other_heads):
                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
                xgrid += size
            _LOGGER.debug("Header spans: self=%s other=%s merged=%s", self_xheaders, other_xheaders, merged_xheaders)

            # We want to stuff/move the cell positions inplace, therefore we start
            # backwards moving the high numbers even higher, so that we don't
            # overwrite ourselves
            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
                merged_xpos = sorted(merged_xheaders[max(self_xhead, other_xhead)], reverse=True)
                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
                if self_xpos != merged_xpos:
                    _move_cells(self.cells, self_xpos, merged_xpos)
                if other_xpos != merged_xpos:
                    _move_cells(other.cells, other_xpos, merged_xpos)

        # We must move the cells downwards now, but minus the header rows
        rows = self.grid[1] - other.header_rows
        for cell in other.cells:
            # Discard the header cells, we just assume they are the same
            if not cell.is_header:
                cell._move(0, rows)
                self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (xgrid, other.grid[1] + rows)
        _LOGGER.debug("%s -> %s", self._page, self.grid)
        return True

    def append_side(self, other, expand=False) -> bool:
        """Append all cells of `other` to the right of this table.

        If the row counts differ, the shorter table's cells in its last row are
        expanded when `expand` is set; otherwise False is returned and nothing
        is appended.
        """
        if self.grid[1] != other.grid[1]:
            if not expand:
                _LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
                return False
            _LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
            ymin = min(self.grid[1], other.grid[1])
            ymax = max(self.grid[1], other.grid[1])
            etable = other if self.grid[1] > other.grid[1] else self
            for cell in etable.cells:
                if any(p[0] == ymin - 1 for p in cell.positions):
                    cell._expand(0, ymax - ymin)
            etable.grid = (etable.grid[0], ymax)

        # We must move all cells to the right now
        columns = self.grid[0]
        for cell in other.cells:
            cell._move(columns, 0)
            self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
        return True

    @cached_property
    def header_rows(self) -> int:
        """Number of header rows: one more than the largest last-position row of any header cell, else 0."""
        # NOTE: cached on first access, later changes to the cells are not reflected
        return max((c.positions[-1][0] + 1 for c in self.cells if c.is_header), default=0)

    def __repr__(self) -> str:
        return f"Table({self.grid[0]}x{self.grid[1]})"


class VirtualTable(Table):
    """Table assembled from already existing cells instead of border lines."""

    def __init__(self, page, bbox, cells, table_type=None):
        """Adopt `cells` and derive the grid size from their largest x and y.

        An empty `cells` list results in a grid of (0, 0).
        """
        self._page = page
        self._spacing = page._spacing
        self._type = table_type or "virtual"
        self.bbox = bbox
        # Table.__init__ is not called, keep its attributes available
        self.cbbox = None
        self._bit_headers = None
        self._cells = cells
        self.grid = (max((c.x for c in cells), default=-1) + 1, max((c.y for c in cells), default=-1) + 1)
        for cell in cells:
            cell._table = self

    def __repr__(self) -> str:
        return f"VTable({self.grid[0]}x{self.grid[1]})"
class
Table:
class Table:
    """Table whose cell grid is derived from clustered border lines.

    The constructor clusters the vertical and horizontal lines into grid
    positions; the cells themselves are built lazily by the `cells` property.
    """

    def __init__(
        self, page, bbox: Rectangle, xlines: list, ylines: list, cbbox: Rectangle = None, is_register: bool = False
    ):
        """Cluster the border lines into the x/y grid positions of the table.

        Args:
            page: Page object; `_spacing`, `charlines_in_area()` and
                `chars_in_area()` are used.
            bbox: Bounding box of the table.
            xlines: Vertical lines, clustered by `p0.x`.
            ylines: Horizontal lines, clustered by `p0.y`.
            cbbox: For register tables the area searched for the top row of
                bit numbers; otherwise only stored as `self.cbbox`.
            is_register: Detect bit position columns and add virtual lines.
        """
        self._page = page
        self._spacing = page._spacing
        self.bbox = bbox
        self.cbbox = None if is_register else cbbox
        self._type = "table"
        self._bit_headers = None

        def _cluster(lines, key):
            # A line joins the current cluster when it is within `atol` of the
            # previous (sorted) line; clusters are keyed by their first line.
            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
            grid = defaultdict(list)
            last = -1e9
            current = -1e9
            for line in sorted(lines, key=key):
                pos = key(line)
                if (last + atol) < pos:
                    current = pos
                grid[current].append(line)
                last = pos
            return grid

        def _rekey(grid, key):
            # Key every cluster by its rounded mean position. Clusters that
            # round to the same integer are merged instead of overwriting
            # each other, so no line is silently dropped.
            rekeyed = {}
            for cluster in grid.values():
                pos = int(round(statistics.fmean(key(m) for m in cluster)))
                rekeyed.setdefault(pos, []).extend(cluster)
            return rekeyed

        xgrid = _cluster(xlines, lambda line: line.p0.x)
        ygrid = _cluster(ylines, lambda line: line.p0.y)

        if is_register:
            self._type = "register"
            x_tol = self._page._spacing["x_em"] / 2

            # Find the positions of the top numbers
            clusters = []
            if lines := self._page.charlines_in_area(cbbox):
                if len(cluster := lines[0].clusters(x_tol)):
                    clusters.append((cluster, cbbox))
                else:
                    # __repr__ reads self.grid, so set it before logging
                    self.grid = (0, 0)
                    _LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")

            # Find the positions of the second row of numbers
            if len(ygrid) > 2:
                ykeys = sorted(ygrid)
                for yi, (ypos0, ypos1) in enumerate(zip(ykeys, ykeys[1:])):
                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y, self.bbox.right, ygrid[ypos1][0].p0.y)
                    lines = self._page.charlines_in_area(nbbox)
                    if not lines:
                        continue
                    if not all(c.char.isnumeric() or c.unicode in {0x20, 0xA, 0xD} for c in lines[0].chars):
                        continue
                    # Only the first purely numeric row is considered
                    cluster = lines[0].clusters(x_tol)
                    if not len(cluster) % 16:
                        clusters.append((cluster, nbbox))
                        self._bit_headers = len(ygrid) - yi - 1
                    else:
                        # __repr__ reads self.grid, so set it before logging
                        self.grid = (len(cluster), 0)
                        _LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
                    break

            # Merge these clusters to find their positions
            for cluster, cluster_bbox in clusters:
                # Close left and right side. Without any vertical line the
                # table edges themselves become the outermost clusters.
                left_key = min(xgrid, default=self.bbox.left)
                right_key = max(xgrid, default=self.bbox.right)
                xgrid[left_key].append(VLine(self.bbox.left, cluster_bbox.bottom, cluster_bbox.top))
                xgrid[right_key].append(VLine(self.bbox.right, cluster_bbox.bottom, cluster_bbox.top))
                # Now close the lines in between
                for cleft, cright in zip(cluster, cluster[1:]):
                    # find a line between the clusters
                    xpos = next(
                        (
                            (x, xgrid[x][0].p0.x)
                            for x in sorted(xgrid)
                            if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left
                        ),
                        None,
                    )
                    # Didn't find one, we must add one manually
                    if xpos is None:
                        xmid = (cleft.bbox.right + cright.bbox.left) / 2
                        xpos = (int(round(xmid)), xmid)
                    # Add it to the grid
                    xgrid[xpos[0]].append(VLine(xpos[1], cluster_bbox.bottom, cluster_bbox.top))
            # close the top
            # NOTE(review): nesting reconstructed from a listing without
            # indentation; assumed to apply to register tables only - confirm.
            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))

        # Fix the position keys properly
        self._xgrid = _rekey(xgrid, lambda line: line.p0.x)
        self._ygrid = _rekey(ygrid, lambda line: line.p0.y)
        # Map the positions to integers
        self._xpos = sorted(self._xgrid)
        self._ypos = sorted(self._ygrid)

        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
        self._cells = None

    def _cell_borders(self, x: int, y: int, bbox: Rectangle, mask: int = 0b1111) -> Borders:
        """Return the border widths of grid cell (x, y).

        A border is the width of the first line in the respective grid cluster
        that strictly spans the midpoint of `bbox`, or 0 if there is none.
        `mask` selects the sides to check: 0b1000 left, 0b0100 bottom,
        0b0010 right, 0b0001 top.
        """
        mp = bbox.midpoint

        def _width(lines, axis, mid):
            for line in lines:
                if getattr(line.p0, axis) < mid < getattr(line.p1, axis):
                    assert line.width
                    return line.width
            return 0

        left = _width(self._xgrid[self._xpos[x]], "y", mp.y) if mask & 0b1000 else 0
        right = _width(self._xgrid[self._xpos[x + 1]], "y", mp.y) if mask & 0b0010 else 0
        bottom = _width(self._ygrid[self._ypos[y]], "x", mp.x) if mask & 0b0100 else 0
        top = _width(self._ygrid[self._ypos[y + 1]], "x", mp.x) if mask & 0b0001 else 0
        # Borders takes: left, bottom, right, top
        return Borders(left, bottom, right, top)

    def _fix_borders(self, cells, x: int, y: int):
        """Close inconsistent open borders between cell (x, y) and its right and top neighbors."""
        c = cells[(x, y)].borders
        right_cell = cells[(x + 1, y)]
        top_cell = cells[(x, y + 1)]
        # Missing neighbors are replaced by throwaway placeholder borders
        r = right_cell.borders if right_cell is not None else Borders(0, 0, 1, 0)
        t = top_cell.borders if top_cell is not None else Borders(0, 1, 0, 0)

        if not c.top:
            t_closed = t.right and t.left
            # Open at the top into a span, or open at the top and self is a span
            if (c.right and not t_closed) or (not c.right and t_closed):
                c.top = 1
                t.bottom = 1

        # This check deliberately sees the possibly just closed top border
        if not c.right:
            r_closed = r.top and r.bottom
            # Open to the right into a span, or open to the right and self is a span
            if (c.top and not r_closed) or (not c.top and r_closed):
                c.right = 1
                r.left = 1

    def _header_row(self, cells) -> int:
        """Return the lowest header row index, or `self.grid[1]` if no header is found."""
        rows = self.grid[1]
        y_header_pos = rows
        # NOTE(review): nesting reconstructed from a listing without
        # indentation; header detection is assumed to be skipped entirely for
        # register tables - confirm.
        if self._type == "register":
            return y_header_pos

        # Find the header line, it is thicker than normal
        if rows > 1:
            line_widths = {
                round(line.width, 1) for llist in self._ygrid.values() for line in llist if line.width != 0.1
            }  # magic width of virtual borders
            if line_widths:
                line_width_max = max(line_widths) * 0.9
                if min(line_widths) < line_width_max:
                    # Find the first thick line starting from the top
                    y_header_pos = next(
                        (
                            yi
                            for yi, ypos in reversed(list(enumerate(self._ypos)))
                            if any(line.width > line_width_max for line in self._ygrid[ypos])
                        ),
                        y_header_pos,
                    )

        # Determine per row whether its characters are mostly bold
        is_bold = []
        for yi in range(0 if y_header_pos == rows else y_header_pos, rows):
            bbox = None
            for xi in range(self.grid[0]):
                if (cell := cells[(xi, yi)]) is not None:
                    bbox = cell.bbox if bbox is None else bbox.joined(cell.bbox)
            if bbox is None:
                continue
            chars = self._page.chars_in_area(bbox)
            # A row without characters counts as fully bold
            is_bold_pct = sum("Bold" in c.font for c in chars) / len(chars) if chars else 1
            is_bold.append((yi, is_bold_pct > self._spacing["th"]))

        if not any(bold for _, bold in is_bold):
            # Some tables have no bold cells at all
            # Special case for two row tables without bold headers, but a bold line inbetween
            if rows == 2 and y_header_pos == 1:
                y_header_pos = 2
        elif y_header_pos < rows:
            # Find the lowest bold row starting from bold line
            y_header_pos = next((yi for yi, bold in is_bold if y_header_pos <= yi and bold), y_header_pos)
        else:
            # Find the lowest bold row starting from the top
            for yi, bold in reversed(is_bold):
                if not bold:
                    break
                y_header_pos = yi
        return y_header_pos

    @property
    def cells(self) -> list[Cell]:
        """Cells of the table, built on first access and cached in `self._cells`.

        Grid cells are created from the grid positions, their borders are made
        consistent, cells with open borders are merged into spans, header cells
        are flagged and register cells are normalized.
        """
        if self._cells is not None:
            return self._cells
        # Compare both dimensions explicitly: tuple comparison is
        # lexicographic and would accept e.g. a grid of (2, 0).
        if self.grid[0] < 1 or self.grid[1] < 1:
            self._cells = []
            return self._cells

        columns, rows = self.grid
        is_register = self._type == "register"

        # First determine the spans of cells by checking the borders
        cells = defaultdict(lambda: None)
        for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
            for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
                bbox = Rectangle(x0, y0, x1, y1)
                borders = self._cell_borders(xi, yi, bbox, 0b1111)
                cells[(xi, yi)] = Cell(self, (rows - 1 - yi, xi), bbox, borders, is_register)

        # Fix table cell borders via consistency checks
        for yi in range(rows):
            for xi in range(columns):
                self._fix_borders(cells, xi, yi)

        # Merge the cells recursively into the cell at (px, py)
        def _merge(px, py, x, y):
            if cells[(x, y)] is None:
                return
            # Right border is open
            if not cells[(x, y)].borders.right:
                if cells[(x + 1, y)] is not None:
                    cells[(px, py)]._merge(cells[(x + 1, y)])
                    _merge(px, py, x + 1, y)
                    cells[(x + 1, y)] = None
            # Top border is open
            if not cells[(x, y)].borders.top:
                if cells[(x, y + 1)] is not None:
                    cells[(px, py)]._merge(cells[(x, y + 1)])
                    _merge(px, py, x, y + 1)
                    cells[(x, y + 1)] = None

        # Start merging in bottom left cell
        for yi in range(rows):
            for xi in range(columns):
                _merge(xi, yi, xi, yi)

        # Tell the header cells
        for yi in range(self._header_row(cells), rows):
            for xi in range(columns):
                if (cell := cells[(xi, yi)]) is not None:
                    cell.is_header = True

        # Flatten into array
        cells = [c for c in cells.values() if c is not None]

        # Normalize cells for registers by moving the lower ones right and up
        if is_register and self._bit_headers is not None:
            for cell in cells:
                if cell.y >= self._bit_headers:
                    cell._move(16, -self._bit_headers)
                elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
                    cell._expand(0, 3 - self._bit_headers)
            self.grid = (32, 4)

        self._cells = sorted(cells, key=lambda c: c.positions[0])
        return self._cells

    def append_bottom(self, other, merge_headers=True) -> bool:
        """Append the non-header cells of `other` below this table.

        With `merge_headers` and differing column counts, the columns of both
        tables are first rewritten in place onto a shared layout derived from
        their header cells.

        Returns:
            False if the header layouts have a different number of column
            groups (nothing is appended), otherwise True.
        """
        xgrid = self.grid[0]
        if merge_headers and xgrid != other.grid[0]:
            # Some tables have different column layouts due to span cells
            # So we must correct the X positions of all cells accordingly

            def _header_spans(table):
                # Map each header cell's x to the set of columns it occupies
                spans = defaultdict(set)
                headers = [c for c in table.cells if c.is_header]
                for xpos in range(table.grid[0]):
                    for hcell in headers:
                        if any(p[1] == xpos for p in hcell.positions):
                            spans[hcell.x].add(xpos)
                return spans

            def _insert_cells(cell, src, dsts, insert_only):
                assert dsts
                new_positions = []
                for cpos in reversed(cell.positions):
                    if cpos[1] != src:
                        new_positions.append(cpos)
                    elif insert_only:
                        # Keep the position and add all destination columns
                        new_positions.extend((cpos[0], xpos) for xpos in dsts)
                        new_positions.append(cpos)
                    else:
                        # Move the position to the destination column
                        new_positions.append((cpos[0], dsts[0]))
                assert new_positions
                assert len(new_positions) == len(set(new_positions))
                cell.positions = sorted(new_positions)
                cell._invalidate()

            def _move_cells(cells, own_xpos, merged_xpos):
                _LOGGER.debug("Rewrite columns: %s->%s", own_xpos, merged_xpos)
                for ii in range(max(len(own_xpos), len(merged_xpos))):
                    if ii >= len(own_xpos):
                        # Own columns are used up: spread the last moved
                        # column over all remaining merged columns
                        src = merged_xpos[ii - 1]
                        dsts = merged_xpos[ii:]
                        for cell in cells:
                            _insert_cells(cell, src, dsts, True)
                        break
                    src = own_xpos[ii]
                    dsts = merged_xpos[ii : ii + 1]
                    for cell in cells:
                        _insert_cells(cell, src, dsts, False)

            # Find the smallest set of spanning xpositions based on the header cells
            self_xheaders = _header_spans(self)
            other_xheaders = _header_spans(other)
            self_heads = sorted(self_xheaders)
            other_heads = sorted(other_xheaders)
            # If they are not equal length the table layouts are not compatible at all!
            if len(self_heads) != len(other_heads):
                _LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
                return False

            # Zip the groups together, these represent the matching header group spans
            xgrid = 0
            merged_xheaders = {}
            for self_xhead, other_xhead in zip(self_heads, other_heads):
                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
                xgrid += size
            _LOGGER.debug("Header spans: self=%s other=%s merged=%s", self_xheaders, other_xheaders, merged_xheaders)

            # We want to stuff/move the cell positions inplace, therefore we start
            # backwards moving the high numbers even higher, so that we don't
            # overwrite ourselves
            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
                merged_xpos = sorted(merged_xheaders[max(self_xhead, other_xhead)], reverse=True)
                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
                if self_xpos != merged_xpos:
                    _move_cells(self.cells, self_xpos, merged_xpos)
                if other_xpos != merged_xpos:
                    _move_cells(other.cells, other_xpos, merged_xpos)

        # We must move the cells downwards now, but minus the header rows
        rows = self.grid[1] - other.header_rows
        for cell in other.cells:
            # Discard the header cells, we just assume they are the same
            if not cell.is_header:
                cell._move(0, rows)
                self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (xgrid, other.grid[1] + rows)
        _LOGGER.debug("%s -> %s", self._page, self.grid)
        return True

    def append_side(self, other, expand=False) -> bool:
        """Append all cells of `other` to the right of this table.

        If the row counts differ, the shorter table's cells in its last row are
        expanded when `expand` is set; otherwise False is returned and nothing
        is appended.
        """
        if self.grid[1] != other.grid[1]:
            if not expand:
                _LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
                return False
            _LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
            ymin = min(self.grid[1], other.grid[1])
            ymax = max(self.grid[1], other.grid[1])
            etable = other if self.grid[1] > other.grid[1] else self
            for cell in etable.cells:
                if any(p[0] == ymin - 1 for p in cell.positions):
                    cell._expand(0, ymax - ymin)
            etable.grid = (etable.grid[0], ymax)

        # We must move all cells to the right now
        columns = self.grid[0]
        for cell in other.cells:
            cell._move(columns, 0)
            self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
        return True

    @cached_property
    def header_rows(self) -> int:
        """Number of header rows: one more than the largest last-position row of any header cell, else 0."""
        # NOTE: cached on first access, later changes to the cells are not reflected
        return max((c.positions[-1][0] + 1 for c in self.cells if c.is_header), default=0)

    def __repr__(self) -> str:
        return f"Table({self.grid[0]}x{self.grid[1]})"
Table( page, bbox: modm_data.utils.Rectangle, xlines: list, ylines: list, cbbox: modm_data.utils.Rectangle = None, is_register: bool = False)
def __init__(
    self, page, bbox: Rectangle, xlines: list, ylines: list, cbbox: Rectangle = None, is_register: bool = False
):
    """Cluster the border lines into the x/y grid positions of the table.

    Args:
        page: Page object; `_spacing` and `charlines_in_area()` are used here.
        bbox: Bounding box of the table.
        xlines: Vertical lines, clustered by `p0.x`.
        ylines: Horizontal lines, clustered by `p0.y`.
        cbbox: For register tables the area searched for the top row of
            bit numbers; otherwise only stored as `self.cbbox`.
        is_register: Detect bit position columns and add virtual lines.
    """
    self._page = page
    self._spacing = page._spacing
    self.bbox = bbox
    self.cbbox = None if is_register else cbbox
    self._type = "table"
    self._bit_headers = None

    def _cluster(lines, key):
        # A line joins the current cluster when it is within `atol` of the
        # previous (sorted) line; clusters are keyed by their first line.
        atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
        grid = defaultdict(list)
        last = -1e9
        current = -1e9
        for line in sorted(lines, key=key):
            pos = key(line)
            if (last + atol) < pos:
                current = pos
            grid[current].append(line)
            last = pos
        return grid

    def _rekey(grid, key):
        # Key every cluster by its rounded mean position. Clusters that
        # round to the same integer are merged instead of overwriting
        # each other, so no line is silently dropped.
        rekeyed = {}
        for cluster in grid.values():
            pos = int(round(statistics.fmean(key(m) for m in cluster)))
            rekeyed.setdefault(pos, []).extend(cluster)
        return rekeyed

    xgrid = _cluster(xlines, lambda line: line.p0.x)
    ygrid = _cluster(ylines, lambda line: line.p0.y)

    if is_register:
        self._type = "register"
        x_tol = self._page._spacing["x_em"] / 2

        # Find the positions of the top numbers
        clusters = []
        if lines := self._page.charlines_in_area(cbbox):
            if len(cluster := lines[0].clusters(x_tol)):
                clusters.append((cluster, cbbox))
            else:
                # __repr__ reads self.grid, so set it before logging
                self.grid = (0, 0)
                _LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")

        # Find the positions of the second row of numbers
        if len(ygrid) > 2:
            ykeys = sorted(ygrid)
            for yi, (ypos0, ypos1) in enumerate(zip(ykeys, ykeys[1:])):
                nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y, self.bbox.right, ygrid[ypos1][0].p0.y)
                lines = self._page.charlines_in_area(nbbox)
                if not lines:
                    continue
                if not all(c.char.isnumeric() or c.unicode in {0x20, 0xA, 0xD} for c in lines[0].chars):
                    continue
                # Only the first purely numeric row is considered
                cluster = lines[0].clusters(x_tol)
                if not len(cluster) % 16:
                    clusters.append((cluster, nbbox))
                    self._bit_headers = len(ygrid) - yi - 1
                else:
                    # __repr__ reads self.grid, so set it before logging
                    self.grid = (len(cluster), 0)
                    _LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
                break

        # Merge these clusters to find their positions
        for cluster, cluster_bbox in clusters:
            # Close left and right side. Without any vertical line the
            # table edges themselves become the outermost clusters.
            left_key = min(xgrid, default=self.bbox.left)
            right_key = max(xgrid, default=self.bbox.right)
            xgrid[left_key].append(VLine(self.bbox.left, cluster_bbox.bottom, cluster_bbox.top))
            xgrid[right_key].append(VLine(self.bbox.right, cluster_bbox.bottom, cluster_bbox.top))
            # Now close the lines in between
            for cleft, cright in zip(cluster, cluster[1:]):
                # find a line between the clusters
                xpos = next(
                    (
                        (x, xgrid[x][0].p0.x)
                        for x in sorted(xgrid)
                        if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left
                    ),
                    None,
                )
                # Didn't find one, we must add one manually
                if xpos is None:
                    xmid = (cleft.bbox.right + cright.bbox.left) / 2
                    xpos = (int(round(xmid)), xmid)
                # Add it to the grid
                xgrid[xpos[0]].append(VLine(xpos[1], cluster_bbox.bottom, cluster_bbox.top))
        # close the top
        # NOTE(review): nesting reconstructed from a listing without
        # indentation; assumed to apply to register tables only - confirm.
        ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))

    # Fix the position keys properly
    self._xgrid = _rekey(xgrid, lambda line: line.p0.x)
    self._ygrid = _rekey(ygrid, lambda line: line.p0.y)
    # Map the positions to integers
    self._xpos = sorted(self._xgrid)
    self._ypos = sorted(self._ygrid)

    self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
    self._cells = None
cells: list[modm_data.pdf2html.cell.Cell]
@property
def cells(self) -> list[Cell]:
    """Return the merged cells of the table, building them on first access.

    One ``Cell`` is created per grid slot, borders are made consistent,
    slots with an open right/top border are merged into their neighbour,
    header rows are detected (thick separator line and/or bold text) and,
    for register tables with a second bit-number row, the cells are moved
    into a fixed 32x4 layout. The result is cached in ``self._cells`` and
    sorted by each cell's first position.

    Side effect: for register tables with ``_bit_headers`` set, the first
    access overwrites ``self.grid`` with ``(32, 4)``.

    NOTE(review): the nesting of this body (in particular whether the
    bold-row analysis belongs inside the non-register branch) was
    reconstructed from a flattened listing - confirm against the original.
    """
    if self._cells is None:
        # NOTE(review): tuple comparison is lexicographic, so e.g. (2, 0) is not caught here;
        # the loops below then simply produce no cells
        if self.grid < (1, 1):
            self._cells = []
            return self._cells

        # First determine the spans of cells by checking the borders
        # Missing keys (slots outside the grid) read as None
        cells = defaultdict(lambda: None)
        for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
            for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
                bbox = Rectangle(x0, y0, x1, y1)
                borders = self._cell_borders(xi, yi, bbox, 0b1111)
                # The cell's row index is flipped: the highest y position becomes row 0
                cells[(xi, yi)] = Cell(self, (self.grid[1] - 1 - yi, xi), bbox, borders, self._type == "register")

        # Fix table cell borders via consistency checks
        for yi in range(self.grid[1]):
            for xi in range(self.grid[0]):
                self._fix_borders(cells, xi, yi)

        # Merge the cells recursively
        # (px, py) is the anchor slot that absorbs its neighbours, (x, y) the slot currently inspected
        def _merge(px, py, x, y):
            if cells[(x, y)] is None:
                return
            # print(cells[(x, y)])
            # Right border is open
            if not cells[(x, y)].borders.right:
                if cells[(x + 1, y)] is not None:
                    cells[(px, py)]._merge(cells[(x + 1, y)])
                    _merge(px, py, x + 1, y)
                    # The absorbed slot is cleared only after its own neighbours were followed
                    cells[(x + 1, y)] = None
            # Top border is open
            if not cells[(x, y)].borders.top:
                if cells[(x, y + 1)] is not None:
                    cells[(px, py)]._merge(cells[(x, y + 1)])
                    _merge(px, py, x, y + 1)
                    cells[(x, y + 1)] = None

        # Start merging in bottom left cell
        for yi in range(self.grid[1]):
            for xi in range(self.grid[0]):
                _merge(xi, yi, xi, yi)

        # Find the header line, it is thicker than normal
        # grid[1] means "no header separator found"
        y_header_pos = self.grid[1]
        if self._type != "register":
            if self.grid[1] > 1:
                line_widths = {
                    round(line.width, 1) for llist in self._ygrid.values() for line in llist if line.width != 0.1
                }  # magic width of virtual borders
                if line_widths:
                    # Anything wider than 90% of the widest line counts as thick
                    line_width_max = max(line_widths) * 0.9
                    # Only meaningful if thinner lines exist as well
                    if min(line_widths) < line_width_max:
                        # Find the first thick line starting from the top
                        y_header_pos = next(
                            (
                                yi
                                for yi, ypos in reversed(list(enumerate(self._ypos)))
                                if any(line.width > line_width_max for line in self._ygrid[ypos])
                            ),
                            y_header_pos,
                        )

            # Map all the header
            # Collect (row, is_bold) for the candidate rows: all rows when no thick line was found
            is_bold = []
            for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
                bbox = None
                # Join the boxes of all remaining cells in this row
                for xi in range(self.grid[0]):
                    if (cell := cells[(xi, yi)]) is not None:
                        if bbox is None:
                            bbox = cell.bbox
                        else:
                            bbox = bbox.joined(cell.bbox)
                if bbox is None:
                    continue
                chars = self._page.chars_in_area(bbox)
                # Share of characters whose font name contains "Bold"; a row without characters counts as 1
                is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
                is_bold.append((yi, is_bold_pct > self._spacing["th"]))

            # Some tables have no bold cells at all
            if all(not b[1] for b in is_bold):
                # Special case for two row tables without bold headers, but a bold line inbetween
                if self.grid[1] == 2 and y_header_pos == 1:
                    y_header_pos = 2
            else:
                if y_header_pos < self.grid[1]:
                    # Find the lowest bold row starting from bold line
                    y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
                else:
                    # Find the lowest bold row starting from the top
                    for b in reversed(is_bold):
                        if not b[1]:
                            break
                        y_header_pos = b[0]

        # Tell the header cells
        # Every row from the header position up to the last grid row is a header row
        for yi in range(y_header_pos, self.grid[1]):
            for xi in range(self.grid[0]):
                if (cell := cells[(xi, yi)]) is not None:
                    cell.is_header = True

        # Flatten into array
        cells = [c for c in cells.values() if c is not None]

        # Normalize cells for registers by moving the lower ones right and up
        if self._type == "register" and self._bit_headers is not None:
            for cell in cells:
                if cell.y >= self._bit_headers:
                    cell._move(16, -self._bit_headers)
                elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
                    cell._expand(0, 3 - self._bit_headers)
            # The normalized register layout is always reported as 32 columns by 4 rows
            self.grid = (32, 4)

        self._cells = list(sorted(cells, key=lambda c: c.positions[0]))

    return self._cells
def
append_bottom(self, other, merge_headers=True) -> bool:
def append_bottom(self, other, merge_headers=True) -> bool:
    """Append the non-header cells of ``other`` below this table.

    When ``merge_headers`` is true and the two tables have a different
    number of columns, the column positions of the cells of both tables are
    first rewritten in place onto a shared layout derived from the columns
    covered by their header cells.

    Args:
        other: Table whose cells are appended. Its cell objects are modified
            and then shared with this table.
        merge_headers: Reconcile differing column layouts via the headers.

    Returns:
        False (after logging an error) when the tables have a different
        number of header groups, otherwise True.

    NOTE(review): the nesting of this body was reconstructed from a
    flattened listing - confirm it against the original file.
    """
    # Local switch for the print() tracing below
    debug = False
    xgrid = self.grid[0]
    if merge_headers and xgrid != other.grid[0]:
        # Some tables have different column layouts due to span cells
        # So we must correct the X positions of all cells accordingly
        # Both map: x of a header cell -> set of columns that header cell covers
        self_xheaders = defaultdict(set)
        other_xheaders = defaultdict(set)
        self_headers = [c for c in self.cells if c.is_header]
        other_headers = [c for c in other.cells if c.is_header]
        # Find the smallest set of spanning xpositions based on the header cells
        for xpos in range(self.grid[0]):
            for hcell in self_headers:
                if any(p[1] == xpos for p in hcell.positions):
                    self_xheaders[hcell.x].add(xpos)
        for xpos in range(other.grid[0]):
            for hcell in other_headers:
                if any(p[1] == xpos for p in hcell.positions):
                    other_xheaders[hcell.x].add(xpos)

        # Compute the shared
        self_heads = sorted(self_xheaders.keys())
        other_heads = sorted(other_xheaders.keys())
        # From here on xgrid counts the columns of the merged layout
        xgrid = 0
        merged_xheaders = defaultdict(set)
        # Zip the groups together, these represent the matching header group spans
        for self_xhead, other_xhead in zip(self_heads, other_heads):
            # The merged group is as wide as the wider of the two groups
            size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
            merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
            xgrid += size

        if debug:
            print(len(self_xheaders), self_xheaders)
            print(len(other_xheaders), other_xheaders)
            print(len(merged_xheaders), merged_xheaders)
        # If they are not equal length the table layouts are not compatible at all!
        if len(self_heads) != len(other_heads):
            _LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
            return False

        # We want to stuff/move the cell positions inplace, therefore we start
        # backwards moving the high numbers even higher, so that we don't
        # overwrite ourselves and get stuck in an infinite loop
        # Zip the groups together, these represent the matching header group spans
        for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
            merged_xhead = max(self_xhead, other_xhead)
            # All column lists are processed from the highest column down
            self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
            other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
            merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)

            # Rewrite the positions of one cell: either insert the columns `dsts`
            # next to column `src`, or move column `src` to `dsts[0]`
            def _insert_cells(cell, src, dsts, insert_only):
                assert dsts
                new_positions = []
                any_change = False
                for cpos in reversed(cell.positions):
                    if insert_only:
                        # If our set is empty we must only insert positions
                        if cpos[1] == src:
                            for xpos in dsts:
                                if debug:
                                    print(f"Insert {cpos}++{(cpos[0], xpos)}")
                                new_positions.append((cpos[0], xpos))
                                any_change = True
                        # The existing position is always kept when inserting
                        new_positions.append(cpos)
                    else:
                        # We must move (=replace and add) the span positions
                        if cpos[1] == src:
                            if debug:
                                print(f"Move {cpos}->{(cpos[0], dsts[0])}")
                            new_positions.append((cpos[0], dsts[0]))
                            any_change = True
                        else:
                            new_positions.append(cpos)
                if debug and any_change:
                    print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
                    print("old=", cell.positions, "new=", sorted(new_positions))
                    print()
                # A rewrite must neither drop all positions nor create duplicates
                assert new_positions
                assert len(new_positions) == len(set(new_positions))
                cell.positions = sorted(new_positions)
                cell._invalidate()

            # Map the columns `own_xpos` of one table onto `merged_xpos` for all given cells
            def _move_cells(cells, own_xpos):
                if debug:
                    print()
                    print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
                    print()

                for ii in range(max(len(own_xpos), len(merged_xpos))):
                    # Own columns are used up: the remaining merged columns can only be inserted
                    insert_only = ii >= len(own_xpos)
                    if insert_only:
                        src = merged_xpos[ii - 1]
                        dsts = merged_xpos[ii:]
                        if debug:
                            print(f"{src}->{dsts} I")
                        for cell in cells:
                            _insert_cells(cell, src, dsts, True)
                        # dsts already covers all remaining merged columns
                        break
                    else:
                        src = own_xpos[ii]
                        dsts = merged_xpos[ii : ii + 1]
                        if debug:
                            print(f"{src}->{dsts} M")
                        for cell in cells:
                            _insert_cells(cell, src, dsts, False)

            if debug:
                print()
            # Only tables whose columns differ from the merged layout need rewriting
            if self_xpos != merged_xpos:
                if debug:
                    print(f"====== Self: x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
                _move_cells(self.cells, self_xpos)
            if other_xpos != merged_xpos:
                if debug:
                    print(
                        f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}"
                    )
                _move_cells(other.cells, other_xpos)
            if debug:
                print()
                print()
                print()

    # We must move the cells downwards now, but minus the header rows
    rows = self.grid[1] - other.header_rows
    for cell in other.cells:
        # Discard the header cells, we just assume they are the same
        if not cell.is_header:
            cell._move(0, rows)
            self.cells.append(cell)
    self.cells.sort(key=lambda c: c.positions[0])
    # Total height: our rows plus the rows of `other` minus its header rows
    self.grid = (xgrid, other.grid[1] + rows)
    if debug:
        print(f"{self._page} -> {self.grid}")
    return True
def
append_side(self, other, expand=False) -> bool:
def append_side(self, other, expand=False) -> bool:
    """Attach the cells of ``other`` to the right-hand side of this table.

    Both tables must have the same number of rows. If they do not and
    ``expand`` is true, the cells touching the last row of the shorter table
    are expanded so that both tables end up equally tall; without ``expand``
    an error is logged and nothing is changed.

    Args:
        other: Table whose cells are shifted right and adopted. The cell
            objects are modified and then shared with this table.
        expand: Allow growing the shorter table instead of failing.

    Returns:
        True when the cells were appended, False on a height mismatch that
        may not be fixed by expanding.
    """
    own_rows = self.grid[1]
    other_rows = other.grid[1]
    if own_rows != other_rows:
        if not expand:
            _LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
            return False

        _LOGGER.debug(
            f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})"
        )
        short_rows, tall_rows = sorted((own_rows, other_rows))
        shorter = other if own_rows > other_rows else self
        last_row = short_rows - 1
        growth = tall_rows - short_rows
        # Only the cells that reach the last row of the shorter table grow
        for cell in shorter.cells:
            if any(pos[0] == last_row for pos in cell.positions):
                cell._expand(0, growth)
        shorter.grid = (shorter.grid[0], tall_rows)

    # Shift the cells of the other table past our last column and adopt them
    shift = self.grid[0]
    for cell in other.cells:
        cell._move(shift, 0)
        self.cells.append(cell)
    self.cells.sort(key=lambda cell: cell.positions[0])
    self.grid = (other.grid[0] + shift, max(self.grid[1], other.grid[1]))
    return True
class VirtualTable(Table):
    """Table assembled from already existing cells instead of detected lines.

    ``Table.__init__`` is intentionally not called: no grid detection takes
    place, the grid size is derived from the given cells.
    """

    def __init__(self, page, bbox, cells, table_type=None):
        """Adopt ``cells`` as the content of a new table.

        Args:
            page: Page object providing ``_spacing``.
            bbox: Bounding box of the table.
            cells: Sequence of cells exposing ``x`` and ``y``; each cell gets
                its ``_table`` attribute pointed at this table. It is iterated
                several times, so it must not be a one-shot iterator.
            table_type: Table type name, defaults to ``"virtual"``.
        """
        self._page = page
        self._spacing = page._spacing
        self._type = table_type or "virtual"
        self.bbox = bbox
        self._cells = cells
        # The grid spans up to the largest cell coordinate. `default=-1` makes
        # an empty cell list yield a 0x0 grid instead of a ValueError from max().
        self.grid = (
            max((c.x for c in cells), default=-1) + 1,
            max((c.y for c in cells), default=-1) + 1,
        )
        for cell in cells:
            cell._table = self

    def __repr__(self) -> str:
        return f"VTable({self.grid[0]}x{self.grid[1]})"
VirtualTable(page, bbox, cells, table_type=None)
def __init__(self, page, bbox, cells, table_type=None):
    """Adopt ``cells`` as the content of a new table.

    No grid detection takes place; the grid size is derived from the
    largest ``x`` and ``y`` of the given cells.

    Args:
        page: Page object providing ``_spacing``.
        bbox: Bounding box of the table.
        cells: Sequence of cells exposing ``x`` and ``y``; each cell gets its
            ``_table`` attribute pointed at this table. It is iterated
            several times, so it must not be a one-shot iterator.
        table_type: Table type name, defaults to ``"virtual"``.
    """
    self._page = page
    self._spacing = page._spacing
    self._type = table_type or "virtual"
    self.bbox = bbox
    self._cells = cells
    # `default=-1` makes an empty cell list yield a 0x0 grid instead of a
    # ValueError from max() on an empty sequence.
    self.grid = (
        max((c.x for c in cells), default=-1) + 1,
        max((c.y for c in cells), default=-1) + 1,
    )
    for cell in cells:
        cell._table = self