modm_data.pdf2html.stmicro.table

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4import logging
  5import statistics
  6from functools import cached_property
  7from collections import defaultdict
  8from ...utils import HLine, VLine, Rectangle
  9
 10LOGGER = logging.getLogger(__name__)
 11
 12
 13class TableCell:
 14    class Borders:
 15        def __init__(self, l, b, r, t):
 16            self.l = l
 17            self.b = b
 18            self.r = r
 19            self.t = t
 20
 21    def __init__(self, table, position, bbox, borders, is_simple=False):
 22        self._table = table
 23        self._bboxes = [bbox]
 24        self.b = borders
 25        self.positions = [position]
 26        self.is_header = False
 27        self._is_simple = is_simple
 28        self._bbox = None
 29        self._lines = None
 30
 31    def _merge(self, other):
 32        self.positions.extend(other.positions)
 33        self.positions.sort()
 34        self._bboxes.append(other.bbox)
 35        self._bbox = None
 36        self._lines = None
 37
 38    def _move(self, x, y):
 39        self.positions = [(py + y, px + x) for (py, px) in self.positions]
 40        self.positions.sort()
 41
 42    def _expand(self, dx, dy):
 43        ymax, xmax = self.positions[-1]
 44        for yi in range(ymax, ymax + dy + 1):
 45            for xi in range(xmax, xmax + dx + 1):
 46                self.positions.append((yi, xi))
 47        self.positions.sort()
 48
 49    @property
 50    def x(self) -> int:
 51        return self.positions[0][1]
 52
 53    @property
 54    def y(self) -> int:
 55        return self.positions[0][0]
 56
 57    @property
 58    def xspan(self) -> int:
 59        return self.positions[-1][1] - self.positions[0][1] + 1
 60
 61    @property
 62    def yspan(self) -> int:
 63        return self.positions[-1][0] - self.positions[0][0] + 1
 64
 65    @property
 66    def rotation(self) -> int:
 67        if not self.lines: return 0
 68        return self.lines[0].rotation
 69
 70    @property
 71    def bbox(self) -> Rectangle:
 72        if self._bbox is None:
 73            self._bbox = Rectangle(min(bbox.left   for bbox in self._bboxes),
 74                                   min(bbox.bottom for bbox in self._bboxes),
 75                                   max(bbox.right  for bbox in self._bboxes),
 76                                   max(bbox.top    for bbox in self._bboxes))
 77        return self._bbox
 78
 79    @property
 80    def lines(self):
 81        if self._lines is None:
 82            self._lines = self._table._page._charlines_filtered(self.bbox)
 83        return self._lines
 84
 85    @property
 86    def content(self):
 87        return "".join(c.char for line in self.lines for c in line.chars)
 88
 89    @property
 90    def left_aligned(self):
 91        x_em = self._table._page._spacing["x_em"]
 92        for line in self.lines:
 93            if (line.bbox.left - self.bbox.left + x_em) < (self.bbox.right - line.bbox.right):
 94                return True
 95        return False
 96
 97    @property
 98    def ast(self):
 99        ast = self._table._page._ast_filtered(self.bbox, with_graphics=False,
100                                              ignore_xpos=not self.left_aligned,
101                                              with_bits=False, with_notes=False)
102        ast.name = "cell"
103        return ast
104
105    def __repr__(self) -> str:
106        positions = ",".join(f"({p[1]},{p[0]})" for p in self.positions)
107        borders = ""
108        if self.b.l: borders += "["
109        if self.b.b: borders += "_"
110        if self.b.t: borders += "^"
111        if self.b.r: borders += "]"
112        start = "CellH" if self.is_header else "Cell"
113        return start + f"[{positions}] {borders}"
114
115
116class Table:
    def __init__(self, page, bbox: Rectangle, xlines: list, ylines: list,
                 cbbox: Rectangle = None, is_register: bool = False):
        """Build the table grid from the lines found on a page.

        :param page: the page the table is on; its ``_spacing`` and character
                     line helpers are used.
        :param bbox: bounding box of the table.
        :param xlines: lines that are clustered by their ``p0.x`` coordinate
                       into grid columns.
        :param ylines: lines that are clustered by their ``p0.y`` coordinate
                       into grid rows.
        :param cbbox: an additional bounding box; for registers it is the area
                      searched for the top bit numbers, otherwise it is only
                      stored as ``self.cbbox``.
        :param is_register: treat the table as a register description and
                            synthesize the missing grid lines from the bit
                            number positions.
        """
        self._page = page
        self._spacing = page._spacing
        self.bbox = bbox
        self.cbbox = None if is_register else cbbox
        self._type = "table"
        self._bit_headers = None

        # Coalesce lines that are close together into one grid position
        def _cluster(lines, key):
            # Lines closer than a quarter of the smaller em size to their
            # predecessor are put into the same cluster
            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
            grid = defaultdict(list)
            last = -1e9
            current = -1e9
            for line in sorted(lines, key=key):
                # The distance is measured to the previous line, not to the
                # start of the cluster, so clusters can chain
                if (last + atol) < key(line):
                    current = key(line)
                grid[current].append(line)
                last = key(line)
            return grid
        xgrid = _cluster(xlines, lambda l: l.p0.x)
        ygrid = _cluster(ylines, lambda l: l.p0.y)

        if is_register:
            self._type = "register"

            # Find the positions of the top numbers
            clusters = []
            if lines := self._page._charlines_filtered(cbbox):
                # Only the first character line is clustered
                if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
                    clusters.append((cluster, cbbox))
                else:
                    # The grid is set here only so that the repr in the log
                    # message works, it is overwritten at the end
                    self.grid = (0, 0)
                    LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")

            # Find the positions of the second row of numbers
            if len(ygrid) > 2:
                # Walk through the bands between neighboring row positions
                for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y,
                                      self.bbox.right, ygrid[ypos1][0].p0.y)
                    if lines := self._page._charlines_filtered(nbbox):
                        # The first line must only consist of numbers and whitespace
                        if all(c.char.isnumeric() or c.unicode in {0x20, 0xa, 0xd} for c in lines[0].chars):
                            # The number of clusters must be a multiple of 16
                            if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
                                clusters.append((cluster, nbbox))
                                self._bit_headers = len(ygrid) - yi - 1
                            else:
                                # Set only for the repr in the log message
                                self.grid = (len(cluster), 0)
                                LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
                            # Only the first numeric band is considered
                            break

            # Merge these clusters to find their positions
            for cluster, bbox in clusters:
                # Close left and right side
                xgrid[sorted(xgrid)[0]].append(VLine(self.bbox.left, bbox.bottom, bbox.top))
                xgrid[sorted(xgrid)[-1]].append(VLine(self.bbox.right, bbox.bottom, bbox.top))
                # Now close the lines in between
                for cleft, cright in zip(cluster, cluster[1:]):
                    # find a line between the clusters
                    xpos = next(((x, xgrid[x][0].p0.x) for x in sorted(xgrid)
                                 if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left), None)
                    # Didn't find one, we must add one manually
                    if xpos is None:
                        xpos = (cleft.bbox.right + cright.bbox.left) / 2
                        xpos = (int(round(xpos)), xpos)
                    # Add it to the grid
                    xgrid[xpos[0]].append(VLine(xpos[1], bbox.bottom, bbox.top))
            # close the top
            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))

        # Fix the position keys properly: use the rounded mean of each cluster
        self._xgrid = {int(round(statistics.fmean(m.p0.x for m in l))): l
                       for l in xgrid.values()}
        self._ygrid = {int(round(statistics.fmean(m.p0.y for m in l))): l
                       for l in ygrid.values()}
        # Map the positions to integers
        self._xpos = list(sorted(self._xgrid))
        self._ypos = list(sorted(self._ygrid))

        # Number of columns and rows between the grid positions
        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
        # Computed on first access of the `cells` property
        self._cells = None
198
199    def _cell_borders(self, x: int, y: int, bbox: Rectangle,
200                      mask: int = 0b1111) -> tuple[int, int, int, int]:
201        # left, bottom, right, top
202        borders = [0, 0, 0, 0]
203        mp = bbox.midpoint
204        if mask & 0b1000:  # Left
205            for line in self._xgrid[self._xpos[x]]:
206                if line.p0.y < mp.y < line.p1.y:
207                    borders[0] = line.width
208                    assert line.width
209                    break
210        if mask & 0b0010:  # Right
211            for line in self._xgrid[self._xpos[x + 1]]:
212                if line.p0.y < mp.y < line.p1.y:
213                    borders[2] = line.width
214                    assert line.width
215                    break
216        if mask & 0b0100:  # Bottom
217            for line in self._ygrid[self._ypos[y]]:
218                if line.p0.x < mp.x < line.p1.x:
219                    borders[1] = line.width
220                    assert line.width
221                    break
222        if mask & 0b0001:  # Top
223            for line in self._ygrid[self._ypos[y + 1]]:
224                if line.p0.x < mp.x < line.p1.x:
225                    borders[3] = line.width
226                    assert line.width
227                    break
228
229        return TableCell.Borders(*borders)
230
231    def _fix_borders(self, cells, x: int, y: int):
232        # We are looking at the 9 neighbors around the cells
233        cell = cells[(x, y)]
234        c = cells[(x, y)].b
235        r = cells[(x + 1, y)].b if cells[(x + 1, y)] is not None else TableCell.Borders(0, 0, 1, 0)
236        t = cells[(x, y + 1)].b if cells[(x, y + 1)] is not None else TableCell.Borders(0, 1, 0, 0)
237
238        # if (not c.t and c.l and c.r and c.b) and "Reset value" in cell.content:
239        #     c.t = 1
240
241        # Open at the top into a span
242        if (not c.t and c.r) and (not t.r or not t.l):
243            c.t = 1
244            t.b = 1
245        # Open at the top and self is a span
246        if (not c.t and not c.r) and (t.r and t.l):
247            c.t = 1
248            t.b = 1
249
250        # Open to the right into a span
251        if (not c.r and c.t) and (not r.t or not r.b):
252            c.r = 1
253            r.l = 1
254        # Open to the right and self is a span
255        if (not c.r and not c.t) and (r.t and r.b):
256            c.r = 1
257            r.l = 1
258
259    @property
260    def cells(self) -> list[TableCell]:
261        if self._cells is None:
262            if self.grid < (1, 1):
263                self._cells = []
264                return self._cells
265
266            # First determine the spans of cells by checking the borders
267            cells = defaultdict(lambda: None)
268            for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
269                for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
270                    bbox = Rectangle(x0, y0, x1, y1)
271                    borders = self._cell_borders(xi, yi, bbox, 0b1111)
272                    cells[(xi, yi)] = TableCell(self, (self.grid[1] - 1 - yi, xi),
273                                                bbox, borders, self._type == "register")
274
275            # Fix table cell borders via consistency checks
276            for yi in range(self.grid[1]):
277                for xi in range(self.grid[0]):
278                    self._fix_borders(cells, xi, yi)
279
280            # Merge the cells recursively
281            def _merge(px, py, x, y):
282                if cells[(x, y)] is None:
283                    return
284                # print(cells[(x, y)])
285                # Right border is open
286                if not cells[(x, y)].b.r:
287                    if cells[(x + 1, y)] is not None:
288                        cells[(px, py)]._merge(cells[(x + 1, y)])
289                        _merge(px, py, x + 1, y)
290                        cells[(x + 1, y)] = None
291                # Top border is open
292                if not cells[(x, y)].b.t:
293                    if cells[(x, y + 1)] is not None:
294                        cells[(px, py)]._merge(cells[(x, y + 1)])
295                        _merge(px, py, x, y + 1)
296                        cells[(x, y + 1)] = None
297            # Start merging in bottom left cell
298            for yi in range(self.grid[1]):
299                for xi in range(self.grid[0]):
300                    _merge(xi, yi, xi, yi)
301
302            # Find the header line, it is thicker than normal
303            y_header_pos = self.grid[1]
304            if self._type != "register":
305                if self.grid[1] > 1:
306                    line_widths = {round(line.width, 1) for llist in self._ygrid.values()
307                                   for line in llist if line.width != 0.1} # magic width of virtual borders
308                    if line_widths:
309                        line_width_max = max(line_widths) * 0.9
310                        if min(line_widths) < line_width_max:
311                            # Find the first thick line starting from the top
312                            y_header_pos = next((yi for yi, ypos in reversed(list(enumerate(self._ypos)))
313                                                 if any(line.width > line_width_max for line in self._ygrid[ypos])),
314                                                y_header_pos)
315
316                # Map all the header
317                is_bold = []
318                for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
319                    bbox = None
320                    for xi in range(self.grid[0]):
321                        if (cell := cells[(xi, yi)]) is not None:
322                            if bbox is None:
323                                bbox = cell.bbox
324                            else:
325                                bbox = bbox.joined(cell.bbox)
326                    if bbox is None: continue
327                    chars = self._page.chars_in_area(bbox)
328                    is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
329                    is_bold.append((yi, is_bold_pct > self._spacing["th"]))
330
331                # Some tables have no bold cells at all
332                if all(not b[1] for b in is_bold):
333                    # Special case for two row tables without bold headers, but a bold line inbetween
334                    if self.grid[1] == 2 and y_header_pos == 1: y_header_pos = 2
335                else:
336                    if y_header_pos < self.grid[1]:
337                        # Find the lowest bold row starting from bold line
338                        y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
339                    else:
340                        # Find the lowest bold row starting from the top
341                        for b in reversed(is_bold):
342                            if not b[1]: break
343                            y_header_pos = b[0]
344
345            # Tell the header cells
346            for yi in range(y_header_pos, self.grid[1]):
347                for xi in range(self.grid[0]):
348                    if (cell := cells[(xi, yi)]) is not None:
349                        cell.is_header = True
350
351            # Flatten into array
352            cells = [c for c in cells.values() if c is not None]
353
354            # Normalize cells for registers by moving the lower ones right and up
355            if self._type == "register" and self._bit_headers is not None:
356                for cell in cells:
357                    if cell.y >= self._bit_headers:
358                        cell._move(16, -self._bit_headers)
359                    elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
360                        cell._expand(0, 3 - self._bit_headers)
361                self.grid = (32, 4)
362
363            self._cells = list(sorted(cells, key=lambda c: c.positions[0]))
364
365        return self._cells
366
    def append_bottom(self, other, merge_headers=True) -> bool:
        """Append the non-header cells of ``other`` below the cells of this table.

        The cells of ``other`` are modified in place and added to this table.
        If the number of columns differs and ``merge_headers`` is set, the
        column positions of the cells of both tables are first rewritten to a
        shared layout derived from the header cells.

        :param other: the table to append.
        :param merge_headers: reconcile differing column layouts via the headers.
        :return: False if the header layouts cannot be matched, otherwise True.
        """
        # Set to True to print the position rewriting steps
        debug = False
        xgrid = self.grid[0]
        if merge_headers and xgrid != other.grid[0]:
            # Some tables have different column layouts due to span cells
            # So we must correct the X positions of all cells accordingly
            # Maps the x position of a header cell to the columns it covers
            self_xheaders = defaultdict(set)
            other_xheaders = defaultdict(set)
            self_headers = [c for c in self.cells if c.is_header]
            other_headers = [c for c in other.cells if c.is_header]
            # Find the smallest set of spanning xpositions based on the header cells
            for xpos in range(self.grid[0]):
                for hcell in self_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        self_xheaders[hcell.x].add(xpos)
            for xpos in range(other.grid[0]):
                for hcell in other_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        other_xheaders[hcell.x].add(xpos)

            # Compute the shared
            self_heads = sorted(self_xheaders.keys())
            other_heads = sorted(other_xheaders.keys())
            # Counts the columns of the merged layout
            xgrid = 0
            merged_xheaders = defaultdict(set)
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(self_heads, other_heads):
                # Each merged group is as wide as the wider of the two groups
                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
                xgrid += size

            if debug:
                print(len(self_xheaders), self_xheaders)
                print(len(other_xheaders), other_xheaders)
                print(len(merged_xheaders), merged_xheaders)
            # If they are not equal length the table layouts are not compatible at all!
            if len(self_heads) != len(other_heads):
                LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
                return False

            # We want to stuff/move the cell positions inplace, therefore we start
            # backwards moving the high numbers even higher, so that we don't
            # overwrite ourselves and get stuck in an infinite loop
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
                merged_xhead = max(self_xhead, other_xhead)
                # Columns of each group, highest column first
                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
                merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)

                def _insert_cells(cell, src, dsts, insert_only):
                    # Rewrites the positions of `cell` that are in column `src`:
                    # either adds copies in all `dsts` columns (insert_only) or
                    # replaces the column with `dsts[0]`
                    assert dsts
                    new_positions = []
                    any_change = False
                    for cpos in reversed(cell.positions):
                        if insert_only:
                            # If our set is empty we must only insert positions
                            if cpos[1] == src:
                                for xpos in dsts:
                                    if debug:
                                        print(f"Insert {cpos}++{(cpos[0], xpos)}")
                                    new_positions.append((cpos[0], xpos))
                                    any_change = True
                            new_positions.append(cpos)
                        else:
                            # We must move (=replace and add) the span positions
                            if cpos[1] == src:
                                if debug:
                                    print(f"Move {cpos}->{(cpos[0], dsts[0])}")
                                new_positions.append((cpos[0], dsts[0]))
                                any_change = True
                            else:
                                new_positions.append(cpos)
                    if debug and any_change:
                        print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
                        print("old=", cell.positions, "new=", sorted(new_positions))
                        print()
                    # A cell must keep at least one position and no position twice
                    assert new_positions
                    assert len(new_positions) == len(set(new_positions))
                    cell.positions = sorted(new_positions)

                def _move_cells(cells, own_xpos):
                    # Rewrites the columns `own_xpos` of all `cells` to the
                    # columns of the merged group `merged_xpos`
                    if debug:
                        print()
                        print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
                        print()

                    for ii in range(max(len(own_xpos), len(merged_xpos))):
                        # Once the own columns are used up, the remaining
                        # merged columns are filled by inserting positions
                        insert_only = ii >= len(own_xpos)
                        if insert_only:
                            src = merged_xpos[ii - 1]
                            dsts = merged_xpos[ii:]
                            if debug: print(f"{src}->{dsts} I")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, True)
                            # All remaining columns were inserted at once
                            break
                        else:
                            src = own_xpos[ii]
                            dsts = merged_xpos[ii:ii + 1]
                            if debug: print(f"{src}->{dsts} M")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, False)

                if debug: print()
                if self_xpos != merged_xpos:
                    if debug:
                        print(f"====== Self:  x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
                    _move_cells(self.cells, self_xpos)
                if other_xpos != merged_xpos:
                    if debug:
                        print(f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}")
                    _move_cells(other.cells, other_xpos)
            if debug:
                print()
                print()
                print()

        # We must move the cells downwards now, but minus the header rows
        rows = self.grid[1] - other.header_rows
        for cell in other.cells:
            # Discard the header cells, we just assume they are the same
            if not cell.is_header:
                cell._move(0, rows)
                self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (xgrid, other.grid[1] + rows)
        if debug:
            print(f"{self._page} -> {self.grid}")
        return True
496
497    def append_side(self, other, expand=False) -> bool:
498        if self.grid[1] != other.grid[1]:
499            if expand:
500                LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
501                ymin = min(self.grid[1], other.grid[1])
502                ymax = max(self.grid[1], other.grid[1])
503                etable = other if self.grid[1] > other.grid[1] else self
504                for cell in etable.cells:
505                    if any(p[0] == ymin - 1 for p in cell.positions):
506                        cell._expand(0, ymax - ymin)
507                etable.grid = (etable.grid[0], ymax)
508            else:
509                LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
510                return False
511
512        # We must move all cells to the right now
513        columns = self.grid[0]
514        for cell in other.cells:
515            cell._move(columns, 0)
516            self.cells.append(cell)
517        self.cells.sort(key=lambda c: c.positions[0])
518        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
519        return True
520
521    @cached_property
522    def header_rows(self) -> int:
523        header_cells = [c for c in self.cells if c.is_header]
524        if header_cells:
525            return max(c.positions[-1][0] + 1 for c in header_cells)
526        return 0
527
528    def __repr__(self) -> str:
529        return f"Table({self.grid[0]}x{self.grid[1]})"
530
531
class VirtualTable(Table):
    """A table that is built from existing cells instead of lines on a page."""

    def __init__(self, page, bbox, cells, table_type=None):
        """Adopt ``cells`` as the content of this table.

        The initializer of ``Table`` is not called, since there are no lines
        to derive a grid from.

        :param page: the page the table belongs to, its ``_spacing`` is used.
        :param bbox: bounding box of the table.
        :param cells: the cells of the table, they are re-parented to this table.
        :param table_type: type name of the table, defaults to ``"virtual"``.
        """
        self._page = page
        self._spacing = page._spacing
        self._type = table_type or "virtual"
        self.bbox = bbox
        self._cells = cells
        # The grid must include every position of spanning cells, not only
        # the anchor of each cell. Without cells the grid is empty.
        columns = max((px for c in cells for _, px in c.positions), default=-1) + 1
        rows = max((py for c in cells for py, _ in c.positions), default=-1) + 1
        self.grid = (columns, rows)
        for cell in cells:
            cell._table = self

    def __repr__(self) -> str:
        return f"VTable({self.grid[0]}x{self.grid[1]})"
LOGGER = <Logger modm_data.pdf2html.stmicro.table (WARNING)>
class TableCell:
 14class TableCell:
 15    class Borders:
 16        def __init__(self, l, b, r, t):
 17            self.l = l
 18            self.b = b
 19            self.r = r
 20            self.t = t
 21
 22    def __init__(self, table, position, bbox, borders, is_simple=False):
 23        self._table = table
 24        self._bboxes = [bbox]
 25        self.b = borders
 26        self.positions = [position]
 27        self.is_header = False
 28        self._is_simple = is_simple
 29        self._bbox = None
 30        self._lines = None
 31
 32    def _merge(self, other):
 33        self.positions.extend(other.positions)
 34        self.positions.sort()
 35        self._bboxes.append(other.bbox)
 36        self._bbox = None
 37        self._lines = None
 38
 39    def _move(self, x, y):
 40        self.positions = [(py + y, px + x) for (py, px) in self.positions]
 41        self.positions.sort()
 42
 43    def _expand(self, dx, dy):
 44        ymax, xmax = self.positions[-1]
 45        for yi in range(ymax, ymax + dy + 1):
 46            for xi in range(xmax, xmax + dx + 1):
 47                self.positions.append((yi, xi))
 48        self.positions.sort()
 49
 50    @property
 51    def x(self) -> int:
 52        return self.positions[0][1]
 53
 54    @property
 55    def y(self) -> int:
 56        return self.positions[0][0]
 57
 58    @property
 59    def xspan(self) -> int:
 60        return self.positions[-1][1] - self.positions[0][1] + 1
 61
 62    @property
 63    def yspan(self) -> int:
 64        return self.positions[-1][0] - self.positions[0][0] + 1
 65
 66    @property
 67    def rotation(self) -> int:
 68        if not self.lines: return 0
 69        return self.lines[0].rotation
 70
 71    @property
 72    def bbox(self) -> Rectangle:
 73        if self._bbox is None:
 74            self._bbox = Rectangle(min(bbox.left   for bbox in self._bboxes),
 75                                   min(bbox.bottom for bbox in self._bboxes),
 76                                   max(bbox.right  for bbox in self._bboxes),
 77                                   max(bbox.top    for bbox in self._bboxes))
 78        return self._bbox
 79
 80    @property
 81    def lines(self):
 82        if self._lines is None:
 83            self._lines = self._table._page._charlines_filtered(self.bbox)
 84        return self._lines
 85
 86    @property
 87    def content(self):
 88        return "".join(c.char for line in self.lines for c in line.chars)
 89
 90    @property
 91    def left_aligned(self):
 92        x_em = self._table._page._spacing["x_em"]
 93        for line in self.lines:
 94            if (line.bbox.left - self.bbox.left + x_em) < (self.bbox.right - line.bbox.right):
 95                return True
 96        return False
 97
 98    @property
 99    def ast(self):
100        ast = self._table._page._ast_filtered(self.bbox, with_graphics=False,
101                                              ignore_xpos=not self.left_aligned,
102                                              with_bits=False, with_notes=False)
103        ast.name = "cell"
104        return ast
105
106    def __repr__(self) -> str:
107        positions = ",".join(f"({p[1]},{p[0]})" for p in self.positions)
108        borders = ""
109        if self.b.l: borders += "["
110        if self.b.b: borders += "_"
111        if self.b.t: borders += "^"
112        if self.b.r: borders += "]"
113        start = "CellH" if self.is_header else "Cell"
114        return start + f"[{positions}] {borders}"
TableCell(table, position, bbox, borders, is_simple=False)
22    def __init__(self, table, position, bbox, borders, is_simple=False):
23        self._table = table
24        self._bboxes = [bbox]
25        self.b = borders
26        self.positions = [position]
27        self.is_header = False
28        self._is_simple = is_simple
29        self._bbox = None
30        self._lines = None
b
positions
is_header
x: int
50    @property
51    def x(self) -> int:
52        return self.positions[0][1]
y: int
54    @property
55    def y(self) -> int:
56        return self.positions[0][0]
xspan: int
58    @property
59    def xspan(self) -> int:
60        return self.positions[-1][1] - self.positions[0][1] + 1
yspan: int
62    @property
63    def yspan(self) -> int:
64        return self.positions[-1][0] - self.positions[0][0] + 1
rotation: int
66    @property
67    def rotation(self) -> int:
68        if not self.lines: return 0
69        return self.lines[0].rotation
bbox: modm_data.utils.math.Rectangle
71    @property
72    def bbox(self) -> Rectangle:
73        if self._bbox is None:
74            self._bbox = Rectangle(min(bbox.left   for bbox in self._bboxes),
75                                   min(bbox.bottom for bbox in self._bboxes),
76                                   max(bbox.right  for bbox in self._bboxes),
77                                   max(bbox.top    for bbox in self._bboxes))
78        return self._bbox
lines
80    @property
81    def lines(self):
82        if self._lines is None:
83            self._lines = self._table._page._charlines_filtered(self.bbox)
84        return self._lines
content
86    @property
87    def content(self):
88        return "".join(c.char for line in self.lines for c in line.chars)
left_aligned
90    @property
91    def left_aligned(self):
92        x_em = self._table._page._spacing["x_em"]
93        for line in self.lines:
94            if (line.bbox.left - self.bbox.left + x_em) < (self.bbox.right - line.bbox.right):
95                return True
96        return False
ast
 98    @property
 99    def ast(self):
100        ast = self._table._page._ast_filtered(self.bbox, with_graphics=False,
101                                              ignore_xpos=not self.left_aligned,
102                                              with_bits=False, with_notes=False)
103        ast.name = "cell"
104        return ast
class TableCell.Borders:
15    class Borders:
16        def __init__(self, l, b, r, t):
17            self.l = l
18            self.b = b
19            self.r = r
20            self.t = t
TableCell.Borders(l, b, r, t)
16        def __init__(self, l, b, r, t):
17            self.l = l
18            self.b = b
19            self.r = r
20            self.t = t
l
b
r
t
class Table:
117class Table:
118    def __init__(self, page, bbox: Rectangle, xlines: list, ylines: list,
119                 cbbox: Rectangle = None, is_register: bool = False):
120        self._page = page
121        self._spacing = page._spacing
122        self.bbox = bbox
123        self.cbbox = None if is_register else cbbox
124        self._type = "table"
125        self._bit_headers = None
126
127        # Coalesce the vertical lines to detect the grid
128        def _cluster(lines, key):
129            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
130            grid = defaultdict(list)
131            last = -1e9
132            current = -1e9
133            for line in sorted(lines, key=key):
134                if (last + atol) < key(line):
135                    current = key(line)
136                grid[current].append(line)
137                last = key(line)
138            return grid
139        xgrid = _cluster(xlines, lambda l: l.p0.x)
140        ygrid = _cluster(ylines, lambda l: l.p0.y)
141
142        if is_register:
143            self._type = "register"
144
145            # Find the positions of the top numbers
146            clusters = []
147            if lines := self._page._charlines_filtered(cbbox):
148                if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
149                    clusters.append((cluster, cbbox))
150                else:
151                    self.grid = (0, 0)
152                    LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")
153
154            # Find the positions of the second row of numbers
155            if len(ygrid) > 2:
156                for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
157                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y,
158                                      self.bbox.right, ygrid[ypos1][0].p0.y)
159                    if lines := self._page._charlines_filtered(nbbox):
160                        if all(c.char.isnumeric() or c.unicode in {0x20, 0xa, 0xd} for c in lines[0].chars):
161                            if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
162                                clusters.append((cluster, nbbox))
163                                self._bit_headers = len(ygrid) - yi - 1
164                            else:
165                                self.grid = (len(cluster), 0)
166                                LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
167                            break
168
169            # Merge these clusters to find their positions
170            for cluster, bbox in clusters:
171                # Close left and right side
172                xgrid[sorted(xgrid)[0]].append(VLine(self.bbox.left, bbox.bottom, bbox.top))
173                xgrid[sorted(xgrid)[-1]].append(VLine(self.bbox.right, bbox.bottom, bbox.top))
174                # Now close the lines in between
175                for cleft, cright in zip(cluster, cluster[1:]):
176                    # find a line between the clusters
177                    xpos = next(((x, xgrid[x][0].p0.x) for x in sorted(xgrid)
178                                 if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left), None)
179                    # Didn't find one, we must add one manually
180                    if xpos is None:
181                        xpos = (cleft.bbox.right + cright.bbox.left) / 2
182                        xpos = (int(round(xpos)), xpos)
183                    # Add it to the grid
184                    xgrid[xpos[0]].append(VLine(xpos[1], bbox.bottom, bbox.top))
185            # close the top
186            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))
187
188        # Fix the position keys properly
189        self._xgrid = {int(round(statistics.fmean(m.p0.x for m in l))): l
190                       for l in xgrid.values()}
191        self._ygrid = {int(round(statistics.fmean(m.p0.y for m in l))): l
192                       for l in ygrid.values()}
193        # Map the positions to integers
194        self._xpos = list(sorted(self._xgrid))
195        self._ypos = list(sorted(self._ygrid))
196
197        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
198        self._cells = None
199
200    def _cell_borders(self, x: int, y: int, bbox: Rectangle,
201                      mask: int = 0b1111) -> tuple[int, int, int, int]:
202        # left, bottom, right, top
203        borders = [0, 0, 0, 0]
204        mp = bbox.midpoint
205        if mask & 0b1000:  # Left
206            for line in self._xgrid[self._xpos[x]]:
207                if line.p0.y < mp.y < line.p1.y:
208                    borders[0] = line.width
209                    assert line.width
210                    break
211        if mask & 0b0010:  # Right
212            for line in self._xgrid[self._xpos[x + 1]]:
213                if line.p0.y < mp.y < line.p1.y:
214                    borders[2] = line.width
215                    assert line.width
216                    break
217        if mask & 0b0100:  # Bottom
218            for line in self._ygrid[self._ypos[y]]:
219                if line.p0.x < mp.x < line.p1.x:
220                    borders[1] = line.width
221                    assert line.width
222                    break
223        if mask & 0b0001:  # Top
224            for line in self._ygrid[self._ypos[y + 1]]:
225                if line.p0.x < mp.x < line.p1.x:
226                    borders[3] = line.width
227                    assert line.width
228                    break
229
230        return TableCell.Borders(*borders)
231
232    def _fix_borders(self, cells, x: int, y: int):
233        # We are looking at the 9 neighbors around the cells
234        cell = cells[(x, y)]
235        c = cells[(x, y)].b
236        r = cells[(x + 1, y)].b if cells[(x + 1, y)] is not None else TableCell.Borders(0, 0, 1, 0)
237        t = cells[(x, y + 1)].b if cells[(x, y + 1)] is not None else TableCell.Borders(0, 1, 0, 0)
238
239        # if (not c.t and c.l and c.r and c.b) and "Reset value" in cell.content:
240        #     c.t = 1
241
242        # Open at the top into a span
243        if (not c.t and c.r) and (not t.r or not t.l):
244            c.t = 1
245            t.b = 1
246        # Open at the top and self is a span
247        if (not c.t and not c.r) and (t.r and t.l):
248            c.t = 1
249            t.b = 1
250
251        # Open to the right into a span
252        if (not c.r and c.t) and (not r.t or not r.b):
253            c.r = 1
254            r.l = 1
255        # Open to the right and self is a span
256        if (not c.r and not c.t) and (r.t and r.b):
257            c.r = 1
258            r.l = 1
259
260    @property
261    def cells(self) -> list[TableCell]:
262        if self._cells is None:
263            if self.grid < (1, 1):
264                self._cells = []
265                return self._cells
266
267            # First determine the spans of cells by checking the borders
268            cells = defaultdict(lambda: None)
269            for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
270                for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
271                    bbox = Rectangle(x0, y0, x1, y1)
272                    borders = self._cell_borders(xi, yi, bbox, 0b1111)
273                    cells[(xi, yi)] = TableCell(self, (self.grid[1] - 1 - yi, xi),
274                                                bbox, borders, self._type == "register")
275
276            # Fix table cell borders via consistency checks
277            for yi in range(self.grid[1]):
278                for xi in range(self.grid[0]):
279                    self._fix_borders(cells, xi, yi)
280
281            # Merge the cells recursively
282            def _merge(px, py, x, y):
283                if cells[(x, y)] is None:
284                    return
285                # print(cells[(x, y)])
286                # Right border is open
287                if not cells[(x, y)].b.r:
288                    if cells[(x + 1, y)] is not None:
289                        cells[(px, py)]._merge(cells[(x + 1, y)])
290                        _merge(px, py, x + 1, y)
291                        cells[(x + 1, y)] = None
292                # Top border is open
293                if not cells[(x, y)].b.t:
294                    if cells[(x, y + 1)] is not None:
295                        cells[(px, py)]._merge(cells[(x, y + 1)])
296                        _merge(px, py, x, y + 1)
297                        cells[(x, y + 1)] = None
298            # Start merging in bottom left cell
299            for yi in range(self.grid[1]):
300                for xi in range(self.grid[0]):
301                    _merge(xi, yi, xi, yi)
302
303            # Find the header line, it is thicker than normal
304            y_header_pos = self.grid[1]
305            if self._type != "register":
306                if self.grid[1] > 1:
307                    line_widths = {round(line.width, 1) for llist in self._ygrid.values()
308                                   for line in llist if line.width != 0.1} # magic width of virtual borders
309                    if line_widths:
310                        line_width_max = max(line_widths) * 0.9
311                        if min(line_widths) < line_width_max:
312                            # Find the first thick line starting from the top
313                            y_header_pos = next((yi for yi, ypos in reversed(list(enumerate(self._ypos)))
314                                                 if any(line.width > line_width_max for line in self._ygrid[ypos])),
315                                                y_header_pos)
316
317                # Map all the header
318                is_bold = []
319                for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
320                    bbox = None
321                    for xi in range(self.grid[0]):
322                        if (cell := cells[(xi, yi)]) is not None:
323                            if bbox is None:
324                                bbox = cell.bbox
325                            else:
326                                bbox = bbox.joined(cell.bbox)
327                    if bbox is None: continue
328                    chars = self._page.chars_in_area(bbox)
329                    is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
330                    is_bold.append((yi, is_bold_pct > self._spacing["th"]))
331
332                # Some tables have no bold cells at all
333                if all(not b[1] for b in is_bold):
334                    # Special case for two row tables without bold headers, but a bold line inbetween
335                    if self.grid[1] == 2 and y_header_pos == 1: y_header_pos = 2
336                else:
337                    if y_header_pos < self.grid[1]:
338                        # Find the lowest bold row starting from bold line
339                        y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
340                    else:
341                        # Find the lowest bold row starting from the top
342                        for b in reversed(is_bold):
343                            if not b[1]: break
344                            y_header_pos = b[0]
345
346            # Tell the header cells
347            for yi in range(y_header_pos, self.grid[1]):
348                for xi in range(self.grid[0]):
349                    if (cell := cells[(xi, yi)]) is not None:
350                        cell.is_header = True
351
352            # Flatten into array
353            cells = [c for c in cells.values() if c is not None]
354
355            # Normalize cells for registers by moving the lower ones right and up
356            if self._type == "register" and self._bit_headers is not None:
357                for cell in cells:
358                    if cell.y >= self._bit_headers:
359                        cell._move(16, -self._bit_headers)
360                    elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
361                        cell._expand(0, 3 - self._bit_headers)
362                self.grid = (32, 4)
363
364            self._cells = list(sorted(cells, key=lambda c: c.positions[0]))
365
366        return self._cells
367
    def append_bottom(self, other, merge_headers=True) -> bool:
        """
        Append the non-header cells of `other` below the cells of this table.

        If `merge_headers` is set and the tables have a different number of
        columns, the column positions of the cells of both tables are first
        rewritten in place so that the column groups spanned by the header
        cells line up. The header cells of `other` are discarded.

        :param other: the table to append, its cells are modified in place.
        :param merge_headers: reconcile differing column layouts via the headers.
        :return: `False` if the header groups of both tables cannot be matched,
                 otherwise `True`.
        """
        # Local switch for the print() based tracing below
        debug = False
        # Number of columns of the resulting table
        xgrid = self.grid[0]
        if merge_headers and xgrid != other.grid[0]:
            # Some tables have different column layouts due to span cells
            # So we must correct the X positions of all cells accordingly
            self_xheaders = defaultdict(set)
            other_xheaders = defaultdict(set)
            self_headers = [c for c in self.cells if c.is_header]
            other_headers = [c for c in other.cells if c.is_header]
            # Find the smallest set of spanning xpositions based on the header cells
            # (maps the first column of a header cell to all columns it covers)
            for xpos in range(self.grid[0]):
                for hcell in self_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        self_xheaders[hcell.x].add(xpos)
            for xpos in range(other.grid[0]):
                for hcell in other_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        other_xheaders[hcell.x].add(xpos)

            # Compute the shared
            self_heads = sorted(self_xheaders.keys())
            other_heads = sorted(other_xheaders.keys())
            xgrid = 0
            merged_xheaders = defaultdict(set)
            # Zip the groups together, these represent the matching header group spans
            # Every merged group is as wide as the wider one of the two groups
            for self_xhead, other_xhead in zip(self_heads, other_heads):
                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
                xgrid += size

            if debug:
                print(len(self_xheaders), self_xheaders)
                print(len(other_xheaders), other_xheaders)
                print(len(merged_xheaders), merged_xheaders)
            # If they are not equal length the table layouts are not compatible at all!
            if len(self_heads) != len(other_heads):
                LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
                return False

            # We want to stuff/move the cell positions inplace, therefore we start
            # backwards moving the high numbers even higher, so that we don't
            # overwrite ourselves and get stuck in an infinite loop
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
                merged_xhead = max(self_xhead, other_xhead)
                # Column lists of this group, highest column first
                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
                merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)

                # Rewrite the positions of one cell: every position in column
                # `src` is either moved to dsts[0] or, if insert_only is set,
                # kept and duplicated into every column of dsts.
                def _insert_cells(cell, src, dsts, insert_only):
                    assert dsts
                    new_positions = []
                    any_change = False
                    for cpos in reversed(cell.positions):
                        if insert_only:
                            # If our set is empty we must only insert positions
                            if cpos[1] == src:
                                for xpos in dsts:
                                    if debug:
                                        print(f"Insert {cpos}++{(cpos[0], xpos)}")
                                    new_positions.append((cpos[0], xpos))
                                    any_change = True
                            new_positions.append(cpos)
                        else:
                            # We must move (=replace and add) the span positions
                            if cpos[1] == src:
                                if debug:
                                    print(f"Move {cpos}->{(cpos[0], dsts[0])}")
                                new_positions.append((cpos[0], dsts[0]))
                                any_change = True
                            else:
                                new_positions.append(cpos)
                    if debug and any_change:
                        print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
                        print("old=", cell.positions, "new=", sorted(new_positions))
                        print()
                    assert new_positions
                    # A rewrite must never map two positions onto the same field
                    assert len(new_positions) == len(set(new_positions))
                    cell.positions = sorted(new_positions)

                # Rewrite all cells so that the columns own_xpos of this group
                # become the columns merged_xpos of the merged group.
                def _move_cells(cells, own_xpos):
                    if debug:
                        print()
                        print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
                        print()

                    for ii in range(max(len(own_xpos), len(merged_xpos))):
                        insert_only = ii >= len(own_xpos)
                        if insert_only:
                            # Own columns are used up: spread the previously
                            # written column over all remaining merged columns
                            src = merged_xpos[ii - 1]
                            dsts = merged_xpos[ii:]
                            if debug: print(f"{src}->{dsts} I")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, True)
                            break
                        else:
                            src = own_xpos[ii]
                            dsts = merged_xpos[ii:ii + 1]
                            if debug: print(f"{src}->{dsts} M")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, False)

                if debug: print()
                if self_xpos != merged_xpos:
                    if debug:
                        print(f"====== Self:  x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
                    _move_cells(self.cells, self_xpos)
                if other_xpos != merged_xpos:
                    if debug:
                        print(f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}")
                    _move_cells(other.cells, other_xpos)
            if debug:
                print()
                print()
                print()

        # We must move the cells downwards now, but minus the header rows
        rows = self.grid[1] - other.header_rows
        for cell in other.cells:
            # Discard the header cells, we just assume they are the same
            if not cell.is_header:
                cell._move(0, rows)
                self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (xgrid, other.grid[1] + rows)
        if debug:
            print(f"{self._page} -> {self.grid}")
        return True
497
498    def append_side(self, other, expand=False) -> bool:
499        if self.grid[1] != other.grid[1]:
500            if expand:
501                LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
502                ymin = min(self.grid[1], other.grid[1])
503                ymax = max(self.grid[1], other.grid[1])
504                etable = other if self.grid[1] > other.grid[1] else self
505                for cell in etable.cells:
506                    if any(p[0] == ymin - 1 for p in cell.positions):
507                        cell._expand(0, ymax - ymin)
508                etable.grid = (etable.grid[0], ymax)
509            else:
510                LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
511                return False
512
513        # We must move all cells to the right now
514        columns = self.grid[0]
515        for cell in other.cells:
516            cell._move(columns, 0)
517            self.cells.append(cell)
518        self.cells.sort(key=lambda c: c.positions[0])
519        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
520        return True
521
522    @cached_property
523    def header_rows(self) -> int:
524        header_cells = [c for c in self.cells if c.is_header]
525        if header_cells:
526            return max(c.positions[-1][0] + 1 for c in header_cells)
527        return 0
528
529    def __repr__(self) -> str:
530        return f"Table({self.grid[0]}x{self.grid[1]})"
Table( page, bbox: modm_data.utils.math.Rectangle, xlines: list, ylines: list, cbbox: modm_data.utils.math.Rectangle = None, is_register: bool = False)
118    def __init__(self, page, bbox: Rectangle, xlines: list, ylines: list,
119                 cbbox: Rectangle = None, is_register: bool = False):
120        self._page = page
121        self._spacing = page._spacing
122        self.bbox = bbox
123        self.cbbox = None if is_register else cbbox
124        self._type = "table"
125        self._bit_headers = None
126
127        # Coalesce the vertical lines to detect the grid
128        def _cluster(lines, key):
129            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
130            grid = defaultdict(list)
131            last = -1e9
132            current = -1e9
133            for line in sorted(lines, key=key):
134                if (last + atol) < key(line):
135                    current = key(line)
136                grid[current].append(line)
137                last = key(line)
138            return grid
139        xgrid = _cluster(xlines, lambda l: l.p0.x)
140        ygrid = _cluster(ylines, lambda l: l.p0.y)
141
142        if is_register:
143            self._type = "register"
144
145            # Find the positions of the top numbers
146            clusters = []
147            if lines := self._page._charlines_filtered(cbbox):
148                if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
149                    clusters.append((cluster, cbbox))
150                else:
151                    self.grid = (0, 0)
152                    LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")
153
154            # Find the positions of the second row of numbers
155            if len(ygrid) > 2:
156                for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
157                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y,
158                                      self.bbox.right, ygrid[ypos1][0].p0.y)
159                    if lines := self._page._charlines_filtered(nbbox):
160                        if all(c.char.isnumeric() or c.unicode in {0x20, 0xa, 0xd} for c in lines[0].chars):
161                            if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
162                                clusters.append((cluster, nbbox))
163                                self._bit_headers = len(ygrid) - yi - 1
164                            else:
165                                self.grid = (len(cluster), 0)
166                                LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
167                            break
168
169            # Merge these clusters to find their positions
170            for cluster, bbox in clusters:
171                # Close left and right side
172                xgrid[sorted(xgrid)[0]].append(VLine(self.bbox.left, bbox.bottom, bbox.top))
173                xgrid[sorted(xgrid)[-1]].append(VLine(self.bbox.right, bbox.bottom, bbox.top))
174                # Now close the lines in between
175                for cleft, cright in zip(cluster, cluster[1:]):
176                    # find a line between the clusters
177                    xpos = next(((x, xgrid[x][0].p0.x) for x in sorted(xgrid)
178                                 if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left), None)
179                    # Didn't find one, we must add one manually
180                    if xpos is None:
181                        xpos = (cleft.bbox.right + cright.bbox.left) / 2
182                        xpos = (int(round(xpos)), xpos)
183                    # Add it to the grid
184                    xgrid[xpos[0]].append(VLine(xpos[1], bbox.bottom, bbox.top))
185            # close the top
186            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))
187
188        # Fix the position keys properly
189        self._xgrid = {int(round(statistics.fmean(m.p0.x for m in l))): l
190                       for l in xgrid.values()}
191        self._ygrid = {int(round(statistics.fmean(m.p0.y for m in l))): l
192                       for l in ygrid.values()}
193        # Map the positions to integers
194        self._xpos = list(sorted(self._xgrid))
195        self._ypos = list(sorted(self._ygrid))
196
197        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
198        self._cells = None
bbox
cbbox
grid
cells: list[TableCell]
260    @property
261    def cells(self) -> list[TableCell]:
262        if self._cells is None:
263            if self.grid < (1, 1):
264                self._cells = []
265                return self._cells
266
267            # First determine the spans of cells by checking the borders
268            cells = defaultdict(lambda: None)
269            for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
270                for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
271                    bbox = Rectangle(x0, y0, x1, y1)
272                    borders = self._cell_borders(xi, yi, bbox, 0b1111)
273                    cells[(xi, yi)] = TableCell(self, (self.grid[1] - 1 - yi, xi),
274                                                bbox, borders, self._type == "register")
275
276            # Fix table cell borders via consistency checks
277            for yi in range(self.grid[1]):
278                for xi in range(self.grid[0]):
279                    self._fix_borders(cells, xi, yi)
280
281            # Merge the cells recursively
282            def _merge(px, py, x, y):
283                if cells[(x, y)] is None:
284                    return
285                # print(cells[(x, y)])
286                # Right border is open
287                if not cells[(x, y)].b.r:
288                    if cells[(x + 1, y)] is not None:
289                        cells[(px, py)]._merge(cells[(x + 1, y)])
290                        _merge(px, py, x + 1, y)
291                        cells[(x + 1, y)] = None
292                # Top border is open
293                if not cells[(x, y)].b.t:
294                    if cells[(x, y + 1)] is not None:
295                        cells[(px, py)]._merge(cells[(x, y + 1)])
296                        _merge(px, py, x, y + 1)
297                        cells[(x, y + 1)] = None
298            # Start merging in bottom left cell
299            for yi in range(self.grid[1]):
300                for xi in range(self.grid[0]):
301                    _merge(xi, yi, xi, yi)
302
303            # Find the header line, it is thicker than normal
304            y_header_pos = self.grid[1]
305            if self._type != "register":
306                if self.grid[1] > 1:
307                    line_widths = {round(line.width, 1) for llist in self._ygrid.values()
308                                   for line in llist if line.width != 0.1} # magic width of virtual borders
309                    if line_widths:
310                        line_width_max = max(line_widths) * 0.9
311                        if min(line_widths) < line_width_max:
312                            # Find the first thick line starting from the top
313                            y_header_pos = next((yi for yi, ypos in reversed(list(enumerate(self._ypos)))
314                                                 if any(line.width > line_width_max for line in self._ygrid[ypos])),
315                                                y_header_pos)
316
317                # Map all the header
318                is_bold = []
319                for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
320                    bbox = None
321                    for xi in range(self.grid[0]):
322                        if (cell := cells[(xi, yi)]) is not None:
323                            if bbox is None:
324                                bbox = cell.bbox
325                            else:
326                                bbox = bbox.joined(cell.bbox)
327                    if bbox is None: continue
328                    chars = self._page.chars_in_area(bbox)
329                    is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
330                    is_bold.append((yi, is_bold_pct > self._spacing["th"]))
331
332                # Some tables have no bold cells at all
333                if all(not b[1] for b in is_bold):
334                    # Special case for two row tables without bold headers, but a bold line inbetween
335                    if self.grid[1] == 2 and y_header_pos == 1: y_header_pos = 2
336                else:
337                    if y_header_pos < self.grid[1]:
338                        # Find the lowest bold row starting from bold line
339                        y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
340                    else:
341                        # Find the lowest bold row starting from the top
342                        for b in reversed(is_bold):
343                            if not b[1]: break
344                            y_header_pos = b[0]
345
346            # Tell the header cells
347            for yi in range(y_header_pos, self.grid[1]):
348                for xi in range(self.grid[0]):
349                    if (cell := cells[(xi, yi)]) is not None:
350                        cell.is_header = True
351
352            # Flatten into array
353            cells = [c for c in cells.values() if c is not None]
354
355            # Normalize cells for registers by moving the lower ones right and up
356            if self._type == "register" and self._bit_headers is not None:
357                for cell in cells:
358                    if cell.y >= self._bit_headers:
359                        cell._move(16, -self._bit_headers)
360                    elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
361                        cell._expand(0, 3 - self._bit_headers)
362                self.grid = (32, 4)
363
364            self._cells = list(sorted(cells, key=lambda c: c.positions[0]))
365
366        return self._cells
def append_bottom(self, other, merge_headers=True) -> bool:
    def append_bottom(self, other, merge_headers=True) -> bool:
        """
        Append the non-header cells of `other` below this table, in place.

        When `merge_headers` is set and both tables have a different number of
        columns, the column positions of the cells of *both* tables are first
        rewritten onto a shared column grid that is derived from the header
        cells of each table. Afterwards every non-header cell of `other` is
        shifted down and added to this table's cell list, and `self.grid` is
        updated. The header cells of `other` are dropped.

        :param other: The table to append. Its cells are modified in place and
                      its non-header cells become part of this table.
        :param merge_headers: Align differing column layouts via the header
                              cells before appending.
        :return: `False` if the two tables yield a different number of header
                 groups (an error is logged and no cell position has been
                 rewritten at that point), otherwise `True`.
        """
        # Local switch for the diagnostic print() calls below, hard-coded off
        debug = False
        # Column count of the result, only recomputed if the layouts get merged
        xgrid = self.grid[0]
        if merge_headers and xgrid != other.grid[0]:
            # Some tables have different column layouts due to span cells
            # So we must correct the X positions of all cells accordingly
            # Both map: `x` of a header cell -> set of columns covered by header cells with that `x`
            self_xheaders = defaultdict(set)
            other_xheaders = defaultdict(set)
            self_headers = [c for c in self.cells if c.is_header]
            other_headers = [c for c in other.cells if c.is_header]
            # Find the smallest set of spanning xpositions based on the header cells
            for xpos in range(self.grid[0]):
                for hcell in self_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        self_xheaders[hcell.x].add(xpos)
            for xpos in range(other.grid[0]):
                for hcell in other_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        other_xheaders[hcell.x].add(xpos)

            # Compute the shared column layout of both tables
            self_heads = sorted(self_xheaders.keys())
            other_heads = sorted(other_xheaders.keys())
            xgrid = 0
            merged_xheaders = defaultdict(set)
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(self_heads, other_heads):
                # The merged group is as wide as the wider of the two groups
                # and is placed directly right of the previous merged group
                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
                xgrid += size

            if debug:
                print(len(self_xheaders), self_xheaders)
                print(len(other_xheaders), other_xheaders)
                print(len(merged_xheaders), merged_xheaders)
            # If they are not equal length the table layouts are not compatible at all!
            if len(self_heads) != len(other_heads):
                LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
                return False

            # We want to stuff/move the cell positions inplace, therefore we start
            # backwards moving the high numbers even higher, so that we don't
            # overwrite ourselves and get stuck in an infinite loop
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
                # Same key that was used to store the merged group above
                merged_xhead = max(self_xhead, other_xhead)
                # All three column lists are sorted from the highest to the lowest column
                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
                merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)

                def _insert_cells(cell, src, dsts, insert_only):
                    """
                    Rewrite the positions of `cell` that lie in column `src`.

                    With `insert_only` every such position is kept and a copy
                    of it is added for each column in `dsts`. Otherwise every
                    such position is replaced by one in column `dsts[0]`.
                    Positions in other columns are kept as they are. The
                    result is stored sorted in `cell.positions`.
                    """
                    assert dsts
                    new_positions = []
                    any_change = False
                    for cpos in reversed(cell.positions):
                        if insert_only:
                            # If our set is empty we must only insert positions
                            if cpos[1] == src:
                                for xpos in dsts:
                                    if debug:
                                        print(f"Insert {cpos}++{(cpos[0], xpos)}")
                                    new_positions.append((cpos[0], xpos))
                                    any_change = True
                            new_positions.append(cpos)
                        else:
                            # We must move (=replace and add) the span positions
                            if cpos[1] == src:
                                if debug:
                                    print(f"Move {cpos}->{(cpos[0], dsts[0])}")
                                new_positions.append((cpos[0], dsts[0]))
                                any_change = True
                            else:
                                new_positions.append(cpos)
                    if debug and any_change:
                        print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
                        print("old=", cell.positions, "new=", sorted(new_positions))
                        print()
                    # A cell must keep at least one position and must not
                    # end up with the same position twice
                    assert new_positions
                    assert len(new_positions) == len(set(new_positions))
                    cell.positions = sorted(new_positions)

                def _move_cells(cells, own_xpos):
                    """
                    Rewrite the columns `own_xpos` of all `cells` onto the
                    `merged_xpos` columns of the current header group.

                    Columns are paired by index and moved one by one. Once
                    `own_xpos` is exhausted, the remaining merged columns are
                    inserted as copies of the last column that was moved to.
                    """
                    if debug:
                        print()
                        print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
                        print()

                    for ii in range(max(len(own_xpos), len(merged_xpos))):
                        # No own column is left for this index: fill the rest by inserting
                        insert_only = ii >= len(own_xpos)
                        if insert_only:
                            # Duplicate the previously written merged column
                            # into all remaining merged columns at once
                            src = merged_xpos[ii - 1]
                            dsts = merged_xpos[ii:]
                            if debug: print(f"{src}->{dsts} I")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, True)
                            break
                        else:
                            # Move exactly one own column onto its merged column
                            src = own_xpos[ii]
                            dsts = merged_xpos[ii:ii + 1]
                            if debug: print(f"{src}->{dsts} M")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, False)

                if debug: print()
                # Only rewrite a table whose columns differ from the merged layout
                if self_xpos != merged_xpos:
                    if debug:
                        print(f"====== Self:  x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
                    _move_cells(self.cells, self_xpos)
                if other_xpos != merged_xpos:
                    if debug:
                        print(f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}")
                    _move_cells(other.cells, other_xpos)
            if debug:
                print()
                print()
                print()

        # We must move the cells downwards now, but minus the header rows
        rows = self.grid[1] - other.header_rows
        for cell in other.cells:
            # Discard the header cells, we just assume they are the same
            if not cell.is_header:
                cell._move(0, rows)
                self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        # New height is the height of `other` shifted down by the same row offset
        self.grid = (xgrid, other.grid[1] + rows)
        if debug:
            print(f"{self._page} -> {self.grid}")
        return True
def append_side(self, other, expand=False) -> bool:
498    def append_side(self, other, expand=False) -> bool:
499        if self.grid[1] != other.grid[1]:
500            if expand:
501                LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
502                ymin = min(self.grid[1], other.grid[1])
503                ymax = max(self.grid[1], other.grid[1])
504                etable = other if self.grid[1] > other.grid[1] else self
505                for cell in etable.cells:
506                    if any(p[0] == ymin - 1 for p in cell.positions):
507                        cell._expand(0, ymax - ymin)
508                etable.grid = (etable.grid[0], ymax)
509            else:
510                LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
511                return False
512
513        # We must move all cells to the right now
514        columns = self.grid[0]
515        for cell in other.cells:
516            cell._move(columns, 0)
517            self.cells.append(cell)
518        self.cells.sort(key=lambda c: c.positions[0])
519        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
520        return True
header_rows: int
522    @cached_property
523    def header_rows(self) -> int:
524        header_cells = [c for c in self.cells if c.is_header]
525        if header_cells:
526            return max(c.positions[-1][0] + 1 for c in header_cells)
527        return 0
class VirtualTable(Table):
class VirtualTable(Table):
    """
    A table that is built from an already existing list of cells.

    The initializer assigns all attributes directly and does not invoke the
    initializer of the base class.
    """

    def __init__(self, page, bbox, cells, table_type=None):
        """
        :param page: The page the table belongs to, also provides `_spacing`.
        :param bbox: The bounding box of the table.
        :param cells: The cells of the table, must not be empty. Every cell
                      gets this table assigned as its `_table`.
        :param table_type: The type of the table, `"virtual"` if not given.
        """
        self._page = page
        self._spacing = page._spacing
        self._type = table_type if table_type else "virtual"
        self.bbox = bbox
        self._cells = cells
        # The grid is (columns, rows): one more than the largest cell x and y
        column_count = max(cell.x for cell in cells) + 1
        row_count = max(cell.y for cell in cells) + 1
        self.grid = (column_count, row_count)
        # Make this table the owner of all passed cells
        for owned_cell in cells:
            owned_cell._table = self

    def __repr__(self) -> str:
        """Short description containing the grid size as columns x rows."""
        return f"VTable({self.grid[0]}x{self.grid[1]})"
VirtualTable(page, bbox, cells, table_type=None)
534    def __init__(self, page, bbox, cells, table_type=None):
535        self._page = page
536        self._spacing = page._spacing
537        self._type = table_type or "virtual"
538        self.bbox = bbox
539        self._cells = cells
540        self.grid = (max(c.x for c in cells) + 1, max(c.y for c in cells) + 1)
541        for cell in cells:
542            cell._table = self
bbox
grid