modm_data.pdf2html.table

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4import logging
  5import statistics
  6from functools import cached_property
  7from collections import defaultdict
  8from ..utils import HLine, VLine, Rectangle
  9from .cell import Cell, Borders
 10
 11_LOGGER = logging.getLogger(__name__)
 12
 13
 14class Table:
    def __init__(
        self, page, bbox: Rectangle, xlines: list, ylines: list, cbbox: Rectangle = None, is_register: bool = False
    ):
        """
        Build the table grid from the lines found on a page.

        `xlines` are clustered by their `p0.x` position into grid columns and
        `ylines` by their `p0.y` position into grid rows. For register tables,
        additional `VLine`s are synthesized between neighbouring character
        clusters of the bit number rows and one `HLine` is added at the top of
        `bbox`.

        :param page: the page containing the table; it must provide `_spacing`
            and `charlines_in_area()`.
        :param bbox: the bounding box of the table.
        :param xlines: the lines that get clustered by their `p0.x` position.
        :param ylines: the lines that get clustered by their `p0.y` position.
        :param cbbox: for register tables the area that is searched for the
            top row of bit numbers; for all other tables it is only stored as
            `self.cbbox`.
        :param is_register: whether the table is treated as a register table.
        """
        self._page = page
        self._spacing = page._spacing
        self.bbox = bbox
        # Register tables do not keep the cbbox as an attribute
        self.cbbox = None if is_register else cbbox
        self._type = "table"
        self._bit_headers = None

        # Coalesce lines into grid positions: a line starts a new group when
        # it is more than `atol` away from the previous line (in sorted
        # order). The group is keyed by the position of its first line.
        def _cluster(lines, key):
            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
            grid = defaultdict(list)
            last = -1e9
            current = -1e9
            for line in sorted(lines, key=key):
                if (last + atol) < key(line):
                    current = key(line)
                grid[current].append(line)
                last = key(line)
            return grid

        xgrid = _cluster(xlines, lambda line: line.p0.x)
        ygrid = _cluster(ylines, lambda line: line.p0.y)

        if is_register:
            self._type = "register"

            # Find the positions of the top numbers
            # Only the first character line found in the area is used
            clusters = []
            if lines := self._page.charlines_in_area(cbbox):
                if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
                    clusters.append((cluster, cbbox))
                else:
                    # The grid is set here so that __repr__ works inside the
                    # log message; it is overwritten at the end of __init__
                    self.grid = (0, 0)
                    _LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")

            # Find the positions of the second row of numbers
            if len(ygrid) > 2:
                # Walk over neighbouring grid rows in ascending y order and
                # stop at the first row whose first character line consists
                # only of numeric characters, spaces and line breaks
                for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y, self.bbox.right, ygrid[ypos1][0].p0.y)
                    if lines := self._page.charlines_in_area(nbbox):
                        if all(c.char.isnumeric() or c.unicode in {0x20, 0xA, 0xD} for c in lines[0].chars):
                            # Accept the row only if its cluster count is a
                            # multiple of 16 (a count of zero passes as well)
                            if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
                                clusters.append((cluster, nbbox))
                                self._bit_headers = len(ygrid) - yi - 1
                            else:
                                # Set only for __repr__ in the log message,
                                # overwritten at the end of __init__
                                self.grid = (len(cluster), 0)
                                _LOGGER.warning(
                                    f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})"
                                )
                            break

            # Merge these clusters to find their positions
            for cluster, bbox in clusters:
                # Close left and right side: the new lines are added to the
                # existing outermost groups, no new group is created
                xgrid[sorted(xgrid)[0]].append(VLine(self.bbox.left, bbox.bottom, bbox.top))
                xgrid[sorted(xgrid)[-1]].append(VLine(self.bbox.right, bbox.bottom, bbox.top))
                # Now close the lines in between
                for cleft, cright in zip(cluster, cluster[1:]):
                    # find a line between the clusters: the first group (in
                    # ascending key order) whose first line lies strictly
                    # between the two character clusters
                    xpos = next(
                        (
                            (x, xgrid[x][0].p0.x)
                            for x in sorted(xgrid)
                            if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left
                        ),
                        None,
                    )
                    # Didn't find one, we must add one manually in the middle
                    # of the gap; its group key is the rounded position
                    if xpos is None:
                        xpos = (cleft.bbox.right + cright.bbox.left) / 2
                        xpos = (int(round(xpos)), xpos)
                    # Add it to the grid
                    xgrid[xpos[0]].append(VLine(xpos[1], bbox.bottom, bbox.top))
            # close the top: the group key is exactly `self.bbox.top`, so a
            # new group is created unless a group with that exact key exists
            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))

        # Fix the position keys properly: every group is re-keyed by the
        # rounded mean position of its lines. Groups that round to the same
        # integer overwrite each other, the last one wins.
        self._xgrid = {int(round(statistics.fmean(m.p0.x for m in line))): line for line in xgrid.values()}
        self._ygrid = {int(round(statistics.fmean(m.p0.y for m in line))): line for line in ygrid.values()}
        # Map the positions to integers
        self._xpos = list(sorted(self._xgrid))
        self._ypos = list(sorted(self._ygrid))

        # N grid lines enclose N-1 cells per direction: (columns, rows)
        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
        # The cells are built lazily by the `cells` property
        self._cells = None
103
104    def _cell_borders(self, x: int, y: int, bbox: Rectangle, mask: int = 0b1111) -> tuple[int, int, int, int]:
105        # left, bottom, right, top
106        borders = [0, 0, 0, 0]
107        mp = bbox.midpoint
108        if mask & 0b1000:  # Left
109            for line in self._xgrid[self._xpos[x]]:
110                if line.p0.y < mp.y < line.p1.y:
111                    borders[0] = line.width
112                    assert line.width
113                    break
114        if mask & 0b0010:  # Right
115            for line in self._xgrid[self._xpos[x + 1]]:
116                if line.p0.y < mp.y < line.p1.y:
117                    borders[2] = line.width
118                    assert line.width
119                    break
120        if mask & 0b0100:  # Bottom
121            for line in self._ygrid[self._ypos[y]]:
122                if line.p0.x < mp.x < line.p1.x:
123                    borders[1] = line.width
124                    assert line.width
125                    break
126        if mask & 0b0001:  # Top
127            for line in self._ygrid[self._ypos[y + 1]]:
128                if line.p0.x < mp.x < line.p1.x:
129                    borders[3] = line.width
130                    assert line.width
131                    break
132
133        return Borders(*borders)
134
135    def _fix_borders(self, cells, x: int, y: int):
136        # We are looking at the 9 neighbors around the cells
137        c = cells[(x, y)].borders
138        r = cells[(x + 1, y)].borders if cells[(x + 1, y)] is not None else Borders(0, 0, 1, 0)
139        t = cells[(x, y + 1)].borders if cells[(x, y + 1)] is not None else Borders(0, 1, 0, 0)
140
141        # if (not c.top and c.left and c.right and c.bottom) and "Reset value" in cell.content:
142        #     c.top = 1
143
144        # Open at the top into a span
145        if (not c.top and c.right) and (not t.right or not t.left):
146            c.top = 1
147            t.bottom = 1
148        # Open at the top and self is a span
149        if (not c.top and not c.right) and (t.right and t.left):
150            c.top = 1
151            t.bottom = 1
152
153        # Open to the right into a span
154        if (not c.right and c.top) and (not r.top or not r.bottom):
155            c.right = 1
156            r.left = 1
157        # Open to the right and self is a span
158        if (not c.right and not c.top) and (r.top and r.bottom):
159            c.right = 1
160            r.left = 1
161
    @property
    def cells(self) -> list[Cell]:
        """
        The merged cells of the table, built on first access and then cached
        in `self._cells`.

        One `Cell` is created per grid position, the borders are fixed up via
        `_fix_borders()`, cells with open borders are merged into spans, the
        header cells are marked and, for register tables with a detected
        second bit number row, the cell positions are normalized.

        :return: the cells sorted by their first position.
        """
        if self._cells is None:
            # NOTE(review): tuples compare lexicographically, so a grid like
            # (2, 0) is not caught here and falls through to the code below.
            if self.grid < (1, 1):
                self._cells = []
                return self._cells

            # First determine the spans of cells by checking the borders
            # Missing keys yield None, so neighbors outside the grid can be
            # looked up without bounds checks
            cells = defaultdict(lambda: None)
            for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
                for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
                    bbox = Rectangle(x0, y0, x1, y1)
                    borders = self._cell_borders(xi, yi, bbox, 0b1111)
                    # The cell position is (row, column) with the row index
                    # flipped relative to yi
                    cells[(xi, yi)] = Cell(self, (self.grid[1] - 1 - yi, xi), bbox, borders, self._type == "register")

            # Fix table cell borders via consistency checks
            for yi in range(self.grid[1]):
                for xi in range(self.grid[0]):
                    self._fix_borders(cells, xi, yi)

            # Merge the cells recursively
            # (px, py) is the cell that absorbs its neighbors, (x, y) is the
            # cell whose open borders are currently followed. Absorbed cells
            # are replaced by None after their own open borders were followed.
            def _merge(px, py, x, y):
                if cells[(x, y)] is None:
                    return
                # Right border is open
                if not cells[(x, y)].borders.right:
                    if cells[(x + 1, y)] is not None:
                        cells[(px, py)]._merge(cells[(x + 1, y)])
                        _merge(px, py, x + 1, y)
                        cells[(x + 1, y)] = None
                # Top border is open
                if not cells[(x, y)].borders.top:
                    if cells[(x, y + 1)] is not None:
                        cells[(px, py)]._merge(cells[(x, y + 1)])
                        _merge(px, py, x, y + 1)
                        cells[(x, y + 1)] = None

            # Start merging in bottom left cell
            for yi in range(self.grid[1]):
                for xi in range(self.grid[0]):
                    _merge(xi, yi, xi, yi)

            # Find the header line, it is thicker than normal
            # A value equal to the row count means "no header line found"
            y_header_pos = self.grid[1]
            if self._type != "register":
                if self.grid[1] > 1:
                    line_widths = {
                        round(line.width, 1) for llist in self._ygrid.values() for line in llist if line.width != 0.1
                    }  # magic width of virtual borders
                    if line_widths:
                        # A line is "thick" if it is wider than 90% of the
                        # widest line; this only applies if there is at least
                        # one thinner line width as well
                        line_width_max = max(line_widths) * 0.9
                        if min(line_widths) < line_width_max:
                            # Find the first thick line starting from the top
                            y_header_pos = next(
                                (
                                    yi
                                    for yi, ypos in reversed(list(enumerate(self._ypos)))
                                    if any(line.width > line_width_max for line in self._ygrid[ypos])
                                ),
                                y_header_pos,
                            )

                # Map all the header
                # For every candidate row (all rows if no thick line was
                # found) record whether the share of characters with "Bold"
                # in their font name exceeds the `th` spacing threshold
                is_bold = []
                for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
                    bbox = None
                    for xi in range(self.grid[0]):
                        if (cell := cells[(xi, yi)]) is not None:
                            if bbox is None:
                                bbox = cell.bbox
                            else:
                                bbox = bbox.joined(cell.bbox)
                    if bbox is None:
                        continue
                    chars = self._page.chars_in_area(bbox)
                    # A row without any characters has a bold share of 1
                    is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
                    is_bold.append((yi, is_bold_pct > self._spacing["th"]))

                # Some tables have no bold cells at all
                if all(not b[1] for b in is_bold):
                    # Special case for two row tables without bold headers, but a bold line inbetween
                    if self.grid[1] == 2 and y_header_pos == 1:
                        y_header_pos = 2
                else:
                    if y_header_pos < self.grid[1]:
                        # Find the lowest bold row starting from bold line
                        y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
                    else:
                        # Find the lowest bold row starting from the top
                        for b in reversed(is_bold):
                            if not b[1]:
                                break
                            y_header_pos = b[0]

            # Tell the header cells
            for yi in range(y_header_pos, self.grid[1]):
                for xi in range(self.grid[0]):
                    if (cell := cells[(xi, yi)]) is not None:
                        cell.is_header = True

            # Flatten into array, dropping the merged and missing cells
            cells = [c for c in cells.values() if c is not None]

            # Normalize cells for registers by moving the lower ones right and up
            if self._type == "register" and self._bit_headers is not None:
                for cell in cells:
                    if cell.y >= self._bit_headers:
                        cell._move(16, -self._bit_headers)
                    elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
                        cell._expand(0, 3 - self._bit_headers)
                # The normalized register grid has a fixed size
                self.grid = (32, 4)

            self._cells = list(sorted(cells, key=lambda c: c.positions[0]))

        return self._cells
278
    def append_bottom(self, other, merge_headers=True) -> bool:
        """
        Append the non-header cells of another table below this table.

        If `merge_headers` is set and the tables have a different number of
        columns, the header cells of both tables are used to find matching
        column groups, and the column positions of the cells of both tables
        are rewritten in place to a shared layout before appending.

        The cells of `other` are moved and become part of this table.

        :param other: the table whose cells are appended.
        :param merge_headers: whether to reconcile differing column layouts
            via the header cells.
        :return: `False` if the column layouts were compared and the number of
            header groups differs (nothing is modified then), else `True`.
        """
        # Enables the print() tracing of the position rewriting below
        debug = False
        xgrid = self.grid[0]
        if merge_headers and xgrid != other.grid[0]:
            # Some tables have different column layouts due to span cells
            # So we must correct the X positions of all cells accordingly
            # Maps the `x` of a header cell to the set of columns it covers
            self_xheaders = defaultdict(set)
            other_xheaders = defaultdict(set)
            self_headers = [c for c in self.cells if c.is_header]
            other_headers = [c for c in other.cells if c.is_header]
            # Find the smallest set of spanning xpositions based on the header cells
            for xpos in range(self.grid[0]):
                for hcell in self_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        self_xheaders[hcell.x].add(xpos)
            for xpos in range(other.grid[0]):
                for hcell in other_headers:
                    if any(p[1] == xpos for p in hcell.positions):
                        other_xheaders[hcell.x].add(xpos)

            # Compute the shared
            self_heads = sorted(self_xheaders.keys())
            other_heads = sorted(other_xheaders.keys())
            # From here on `xgrid` counts the columns of the merged layout
            xgrid = 0
            merged_xheaders = defaultdict(set)
            # Zip the groups together, these represent the matching header group spans
            # Every merged group is as wide as the wider of the two groups
            for self_xhead, other_xhead in zip(self_heads, other_heads):
                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
                xgrid += size

            if debug:
                print(len(self_xheaders), self_xheaders)
                print(len(other_xheaders), other_xheaders)
                print(len(merged_xheaders), merged_xheaders)
            # If they are not equal length the table layouts are not compatible at all!
            if len(self_heads) != len(other_heads):
                _LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
                return False

            # We want to stuff/move the cell positions inplace, therefore we start
            # backwards moving the high numbers even higher, so that we don't
            # overwrite ourselves and get stuck in an infinite loop
            # Zip the groups together, these represent the matching header group spans
            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
                merged_xhead = max(self_xhead, other_xhead)
                # All column lists are sorted in descending order
                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
                merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)

                # Rewrite the positions of one cell: either replace column
                # `src` with `dsts[0]` (move), or keep `src` and additionally
                # add all columns in `dsts` (insert_only)
                def _insert_cells(cell, src, dsts, insert_only):
                    assert dsts
                    new_positions = []
                    any_change = False
                    for cpos in reversed(cell.positions):
                        if insert_only:
                            # If our set is empty we must only insert positions
                            if cpos[1] == src:
                                for xpos in dsts:
                                    if debug:
                                        print(f"Insert {cpos}++{(cpos[0], xpos)}")
                                    new_positions.append((cpos[0], xpos))
                                    any_change = True
                            new_positions.append(cpos)
                        else:
                            # We must move (=replace and add) the span positions
                            if cpos[1] == src:
                                if debug:
                                    print(f"Move {cpos}->{(cpos[0], dsts[0])}")
                                new_positions.append((cpos[0], dsts[0]))
                                any_change = True
                            else:
                                new_positions.append(cpos)
                    if debug and any_change:
                        print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
                        print("old=", cell.positions, "new=", sorted(new_positions))
                        print()
                    assert new_positions
                    # The rewrite must not produce duplicate positions
                    assert len(new_positions) == len(set(new_positions))
                    cell.positions = sorted(new_positions)
                    cell._invalidate()

                # Rewrite the columns `own_xpos` of all cells to `merged_xpos`
                # (read from the enclosing loop iteration): first move the
                # existing columns one by one, then let the last moved column
                # expand into the remaining merged columns
                def _move_cells(cells, own_xpos):
                    if debug:
                        print()
                        print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
                        print()

                    for ii in range(max(len(own_xpos), len(merged_xpos))):
                        insert_only = ii >= len(own_xpos)
                        if insert_only:
                            src = merged_xpos[ii - 1]
                            dsts = merged_xpos[ii:]
                            if debug:
                                print(f"{src}->{dsts} I")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, True)
                            # All remaining columns were inserted at once
                            break
                        else:
                            src = own_xpos[ii]
                            dsts = merged_xpos[ii : ii + 1]
                            if debug:
                                print(f"{src}->{dsts} M")
                            for cell in cells:
                                _insert_cells(cell, src, dsts, False)

                if debug:
                    print()
                if self_xpos != merged_xpos:
                    if debug:
                        print(f"====== Self:  x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
                    _move_cells(self.cells, self_xpos)
                if other_xpos != merged_xpos:
                    if debug:
                        print(
                            f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}"
                        )
                    _move_cells(other.cells, other_xpos)
            if debug:
                print()
                print()
                print()

        # We must move the cells downwards now, but minus the header rows
        rows = self.grid[1] - other.header_rows
        for cell in other.cells:
            # Discard the header cells, we just assume they are the same
            if not cell.is_header:
                cell._move(0, rows)
                self.cells.append(cell)
        self.cells.sort(key=lambda c: c.positions[0])
        self.grid = (xgrid, other.grid[1] + rows)
        if debug:
            print(f"{self._page} -> {self.grid}")
        return True
414
415    def append_side(self, other, expand=False) -> bool:
416        if self.grid[1] != other.grid[1]:
417            if expand:
418                _LOGGER.debug(
419                    f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})"
420                )
421                ymin = min(self.grid[1], other.grid[1])
422                ymax = max(self.grid[1], other.grid[1])
423                etable = other if self.grid[1] > other.grid[1] else self
424                for cell in etable.cells:
425                    if any(p[0] == ymin - 1 for p in cell.positions):
426                        cell._expand(0, ymax - ymin)
427                etable.grid = (etable.grid[0], ymax)
428            else:
429                _LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
430                return False
431
432        # We must move all cells to the right now
433        columns = self.grid[0]
434        for cell in other.cells:
435            cell._move(columns, 0)
436            self.cells.append(cell)
437        self.cells.sort(key=lambda c: c.positions[0])
438        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
439        return True
440
441    @cached_property
442    def header_rows(self) -> int:
443        header_cells = [c for c in self.cells if c.is_header]
444        if header_cells:
445            return max(c.positions[-1][0] + 1 for c in header_cells)
446        return 0
447
448    def __repr__(self) -> str:
449        return f"Table({self.grid[0]}x{self.grid[1]})"
450
451
class VirtualTable(Table):
    """
    A table that is built from already existing cells instead of from the
    lines on a page.

    `Table.__init__()` is intentionally not called: no grid lines exist, the
    grid size is derived from the cell positions instead.
    """

    def __init__(self, page, bbox, cells, table_type=None):
        """
        :param page: the page containing the table; it must provide `_spacing`.
        :param bbox: the bounding box of the table.
        :param cells: the cells of the table; every cell gets its `_table`
            attribute pointed at this table. May be empty, which results in a
            grid of `(0, 0)`.
        :param table_type: the type of the table, `"virtual"` if not given.
        """
        self._page = page
        self._spacing = page._spacing
        self._type = table_type or "virtual"
        self.bbox = bbox
        self._cells = cells
        # The grid spans up to the largest cell coordinate. The default of -1
        # yields a size of 0 for an empty cell list, where max() would
        # otherwise raise a ValueError.
        columns = max((c.x for c in cells), default=-1) + 1
        rows = max((c.y for c in cells), default=-1) + 1
        self.grid = (columns, rows)
        for cell in cells:
            cell._table = self

    def __repr__(self) -> str:
        """Describe the table by its grid size as `VTable(<columns>x<rows>)`."""
        return f"VTable({self.grid[0]}x{self.grid[1]})"
class Table:
 15class Table:
    def __init__(
        self, page, bbox: Rectangle, xlines: list, ylines: list, cbbox: Rectangle = None, is_register: bool = False
    ):
        """
        Build the table grid from the lines found on a page.

        `xlines` are clustered by their `p0.x` position into grid columns and
        `ylines` by their `p0.y` position into grid rows. For register tables,
        additional `VLine`s are synthesized between neighbouring character
        clusters of the bit number rows and one `HLine` is added at the top of
        `bbox`.

        :param page: the page containing the table; it must provide `_spacing`
            and `charlines_in_area()`.
        :param bbox: the bounding box of the table.
        :param xlines: the lines that get clustered by their `p0.x` position.
        :param ylines: the lines that get clustered by their `p0.y` position.
        :param cbbox: for register tables the area that is searched for the
            top row of bit numbers; for all other tables it is only stored as
            `self.cbbox`.
        :param is_register: whether the table is treated as a register table.
        """
        self._page = page
        self._spacing = page._spacing
        self.bbox = bbox
        # Register tables do not keep the cbbox as an attribute
        self.cbbox = None if is_register else cbbox
        self._type = "table"
        self._bit_headers = None

        # Coalesce lines into grid positions: a line starts a new group when
        # it is more than `atol` away from the previous line (in sorted
        # order). The group is keyed by the position of its first line.
        def _cluster(lines, key):
            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
            grid = defaultdict(list)
            last = -1e9
            current = -1e9
            for line in sorted(lines, key=key):
                if (last + atol) < key(line):
                    current = key(line)
                grid[current].append(line)
                last = key(line)
            return grid

        xgrid = _cluster(xlines, lambda line: line.p0.x)
        ygrid = _cluster(ylines, lambda line: line.p0.y)

        if is_register:
            self._type = "register"

            # Find the positions of the top numbers
            # Only the first character line found in the area is used
            clusters = []
            if lines := self._page.charlines_in_area(cbbox):
                if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
                    clusters.append((cluster, cbbox))
                else:
                    # The grid is set here so that __repr__ works inside the
                    # log message; it is overwritten at the end of __init__
                    self.grid = (0, 0)
                    _LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")

            # Find the positions of the second row of numbers
            if len(ygrid) > 2:
                # Walk over neighbouring grid rows in ascending y order and
                # stop at the first row whose first character line consists
                # only of numeric characters, spaces and line breaks
                for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y, self.bbox.right, ygrid[ypos1][0].p0.y)
                    if lines := self._page.charlines_in_area(nbbox):
                        if all(c.char.isnumeric() or c.unicode in {0x20, 0xA, 0xD} for c in lines[0].chars):
                            # Accept the row only if its cluster count is a
                            # multiple of 16 (a count of zero passes as well)
                            if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
                                clusters.append((cluster, nbbox))
                                self._bit_headers = len(ygrid) - yi - 1
                            else:
                                # Set only for __repr__ in the log message,
                                # overwritten at the end of __init__
                                self.grid = (len(cluster), 0)
                                _LOGGER.warning(
                                    f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})"
                                )
                            break

            # Merge these clusters to find their positions
            for cluster, bbox in clusters:
                # Close left and right side: the new lines are added to the
                # existing outermost groups, no new group is created
                xgrid[sorted(xgrid)[0]].append(VLine(self.bbox.left, bbox.bottom, bbox.top))
                xgrid[sorted(xgrid)[-1]].append(VLine(self.bbox.right, bbox.bottom, bbox.top))
                # Now close the lines in between
                for cleft, cright in zip(cluster, cluster[1:]):
                    # find a line between the clusters: the first group (in
                    # ascending key order) whose first line lies strictly
                    # between the two character clusters
                    xpos = next(
                        (
                            (x, xgrid[x][0].p0.x)
                            for x in sorted(xgrid)
                            if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left
                        ),
                        None,
                    )
                    # Didn't find one, we must add one manually in the middle
                    # of the gap; its group key is the rounded position
                    if xpos is None:
                        xpos = (cleft.bbox.right + cright.bbox.left) / 2
                        xpos = (int(round(xpos)), xpos)
                    # Add it to the grid
                    xgrid[xpos[0]].append(VLine(xpos[1], bbox.bottom, bbox.top))
            # close the top: the group key is exactly `self.bbox.top`, so a
            # new group is created unless a group with that exact key exists
            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))

        # Fix the position keys properly: every group is re-keyed by the
        # rounded mean position of its lines. Groups that round to the same
        # integer overwrite each other, the last one wins.
        self._xgrid = {int(round(statistics.fmean(m.p0.x for m in line))): line for line in xgrid.values()}
        self._ygrid = {int(round(statistics.fmean(m.p0.y for m in line))): line for line in ygrid.values()}
        # Map the positions to integers
        self._xpos = list(sorted(self._xgrid))
        self._ypos = list(sorted(self._ygrid))

        # N grid lines enclose N-1 cells per direction: (columns, rows)
        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
        # The cells are built lazily by the `cells` property
        self._cells = None
104
105    def _cell_borders(self, x: int, y: int, bbox: Rectangle, mask: int = 0b1111) -> tuple[int, int, int, int]:
106        # left, bottom, right, top
107        borders = [0, 0, 0, 0]
108        mp = bbox.midpoint
109        if mask & 0b1000:  # Left
110            for line in self._xgrid[self._xpos[x]]:
111                if line.p0.y < mp.y < line.p1.y:
112                    borders[0] = line.width
113                    assert line.width
114                    break
115        if mask & 0b0010:  # Right
116            for line in self._xgrid[self._xpos[x + 1]]:
117                if line.p0.y < mp.y < line.p1.y:
118                    borders[2] = line.width
119                    assert line.width
120                    break
121        if mask & 0b0100:  # Bottom
122            for line in self._ygrid[self._ypos[y]]:
123                if line.p0.x < mp.x < line.p1.x:
124                    borders[1] = line.width
125                    assert line.width
126                    break
127        if mask & 0b0001:  # Top
128            for line in self._ygrid[self._ypos[y + 1]]:
129                if line.p0.x < mp.x < line.p1.x:
130                    borders[3] = line.width
131                    assert line.width
132                    break
133
134        return Borders(*borders)
135
136    def _fix_borders(self, cells, x: int, y: int):
137        # We are looking at the 9 neighbors around the cells
138        c = cells[(x, y)].borders
139        r = cells[(x + 1, y)].borders if cells[(x + 1, y)] is not None else Borders(0, 0, 1, 0)
140        t = cells[(x, y + 1)].borders if cells[(x, y + 1)] is not None else Borders(0, 1, 0, 0)
141
142        # if (not c.top and c.left and c.right and c.bottom) and "Reset value" in cell.content:
143        #     c.top = 1
144
145        # Open at the top into a span
146        if (not c.top and c.right) and (not t.right or not t.left):
147            c.top = 1
148            t.bottom = 1
149        # Open at the top and self is a span
150        if (not c.top and not c.right) and (t.right and t.left):
151            c.top = 1
152            t.bottom = 1
153
154        # Open to the right into a span
155        if (not c.right and c.top) and (not r.top or not r.bottom):
156            c.right = 1
157            r.left = 1
158        # Open to the right and self is a span
159        if (not c.right and not c.top) and (r.top and r.bottom):
160            c.right = 1
161            r.left = 1
162
163    @property
164    def cells(self) -> list[Cell]:
165        if self._cells is None:
166            if self.grid < (1, 1):
167                self._cells = []
168                return self._cells
169
170            # First determine the spans of cells by checking the borders
171            cells = defaultdict(lambda: None)
172            for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
173                for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
174                    bbox = Rectangle(x0, y0, x1, y1)
175                    borders = self._cell_borders(xi, yi, bbox, 0b1111)
176                    cells[(xi, yi)] = Cell(self, (self.grid[1] - 1 - yi, xi), bbox, borders, self._type == "register")
177
178            # Fix table cell borders via consistency checks
179            for yi in range(self.grid[1]):
180                for xi in range(self.grid[0]):
181                    self._fix_borders(cells, xi, yi)
182
183            # Merge the cells recursively
184            def _merge(px, py, x, y):
185                if cells[(x, y)] is None:
186                    return
187                # print(cells[(x, y)])
188                # Right border is open
189                if not cells[(x, y)].borders.right:
190                    if cells[(x + 1, y)] is not None:
191                        cells[(px, py)]._merge(cells[(x + 1, y)])
192                        _merge(px, py, x + 1, y)
193                        cells[(x + 1, y)] = None
194                # Top border is open
195                if not cells[(x, y)].borders.top:
196                    if cells[(x, y + 1)] is not None:
197                        cells[(px, py)]._merge(cells[(x, y + 1)])
198                        _merge(px, py, x, y + 1)
199                        cells[(x, y + 1)] = None
200
201            # Start merging in bottom left cell
202            for yi in range(self.grid[1]):
203                for xi in range(self.grid[0]):
204                    _merge(xi, yi, xi, yi)
205
206            # Find the header line, it is thicker than normal
207            y_header_pos = self.grid[1]
208            if self._type != "register":
209                if self.grid[1] > 1:
210                    line_widths = {
211                        round(line.width, 1) for llist in self._ygrid.values() for line in llist if line.width != 0.1
212                    }  # magic width of virtual borders
213                    if line_widths:
214                        line_width_max = max(line_widths) * 0.9
215                        if min(line_widths) < line_width_max:
216                            # Find the first thick line starting from the top
217                            y_header_pos = next(
218                                (
219                                    yi
220                                    for yi, ypos in reversed(list(enumerate(self._ypos)))
221                                    if any(line.width > line_width_max for line in self._ygrid[ypos])
222                                ),
223                                y_header_pos,
224                            )
225
226                # Map all the header
227                is_bold = []
228                for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
229                    bbox = None
230                    for xi in range(self.grid[0]):
231                        if (cell := cells[(xi, yi)]) is not None:
232                            if bbox is None:
233                                bbox = cell.bbox
234                            else:
235                                bbox = bbox.joined(cell.bbox)
236                    if bbox is None:
237                        continue
238                    chars = self._page.chars_in_area(bbox)
239                    is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
240                    is_bold.append((yi, is_bold_pct > self._spacing["th"]))
241
242                # Some tables have no bold cells at all
243                if all(not b[1] for b in is_bold):
244                    # Special case for two row tables without bold headers, but a bold line inbetween
245                    if self.grid[1] == 2 and y_header_pos == 1:
246                        y_header_pos = 2
247                else:
248                    if y_header_pos < self.grid[1]:
249                        # Find the lowest bold row starting from bold line
250                        y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
251                    else:
252                        # Find the lowest bold row starting from the top
253                        for b in reversed(is_bold):
254                            if not b[1]:
255                                break
256                            y_header_pos = b[0]
257
258            # Tell the header cells
259            for yi in range(y_header_pos, self.grid[1]):
260                for xi in range(self.grid[0]):
261                    if (cell := cells[(xi, yi)]) is not None:
262                        cell.is_header = True
263
264            # Flatten into array
265            cells = [c for c in cells.values() if c is not None]
266
267            # Normalize cells for registers by moving the lower ones right and up
268            if self._type == "register" and self._bit_headers is not None:
269                for cell in cells:
270                    if cell.y >= self._bit_headers:
271                        cell._move(16, -self._bit_headers)
272                    elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
273                        cell._expand(0, 3 - self._bit_headers)
274                self.grid = (32, 4)
275
276            self._cells = list(sorted(cells, key=lambda c: c.positions[0]))
277
278        return self._cells
279
280    def append_bottom(self, other, merge_headers=True) -> bool:
281        debug = False
282        xgrid = self.grid[0]
283        if merge_headers and xgrid != other.grid[0]:
284            # Some tables have different column layouts due to span cells
285            # So we must correct the X positions of all cells accordingly
286            self_xheaders = defaultdict(set)
287            other_xheaders = defaultdict(set)
288            self_headers = [c for c in self.cells if c.is_header]
289            other_headers = [c for c in other.cells if c.is_header]
290            # Find the smallest set of spanning xpositions based on the header cells
291            for xpos in range(self.grid[0]):
292                for hcell in self_headers:
293                    if any(p[1] == xpos for p in hcell.positions):
294                        self_xheaders[hcell.x].add(xpos)
295            for xpos in range(other.grid[0]):
296                for hcell in other_headers:
297                    if any(p[1] == xpos for p in hcell.positions):
298                        other_xheaders[hcell.x].add(xpos)
299
300            # Compute the shared
301            self_heads = sorted(self_xheaders.keys())
302            other_heads = sorted(other_xheaders.keys())
303            xgrid = 0
304            merged_xheaders = defaultdict(set)
305            # Zip the groups together, these represent the matching header group spans
306            for self_xhead, other_xhead in zip(self_heads, other_heads):
307                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
308                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
309                xgrid += size
310
311            if debug:
312                print(len(self_xheaders), self_xheaders)
313                print(len(other_xheaders), other_xheaders)
314                print(len(merged_xheaders), merged_xheaders)
315            # If they are not equal length the table layouts are not compatible at all!
316            if len(self_heads) != len(other_heads):
317                _LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
318                return False
319
320            # We want to stuff/move the cell positions inplace, therefore we start
321            # backwards moving the high numbers even higher, so that we don't
322            # overwrite ourselves and get stuck in an infinite loop
323            # Zip the groups together, these represent the matching header group spans
324            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
325                merged_xhead = max(self_xhead, other_xhead)
326                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
327                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
328                merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)
329
330                def _insert_cells(cell, src, dsts, insert_only):
331                    assert dsts
332                    new_positions = []
333                    any_change = False
334                    for cpos in reversed(cell.positions):
335                        if insert_only:
336                            # If our set is empty we must only insert positions
337                            if cpos[1] == src:
338                                for xpos in dsts:
339                                    if debug:
340                                        print(f"Insert {cpos}++{(cpos[0], xpos)}")
341                                    new_positions.append((cpos[0], xpos))
342                                    any_change = True
343                            new_positions.append(cpos)
344                        else:
345                            # We must move (=replace and add) the span positions
346                            if cpos[1] == src:
347                                if debug:
348                                    print(f"Move {cpos}->{(cpos[0], dsts[0])}")
349                                new_positions.append((cpos[0], dsts[0]))
350                                any_change = True
351                            else:
352                                new_positions.append(cpos)
353                    if debug and any_change:
354                        print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
355                        print("old=", cell.positions, "new=", sorted(new_positions))
356                        print()
357                    assert new_positions
358                    assert len(new_positions) == len(set(new_positions))
359                    cell.positions = sorted(new_positions)
360                    cell._invalidate()
361
362                def _move_cells(cells, own_xpos):
363                    if debug:
364                        print()
365                        print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
366                        print()
367
368                    for ii in range(max(len(own_xpos), len(merged_xpos))):
369                        insert_only = ii >= len(own_xpos)
370                        if insert_only:
371                            src = merged_xpos[ii - 1]
372                            dsts = merged_xpos[ii:]
373                            if debug:
374                                print(f"{src}->{dsts} I")
375                            for cell in cells:
376                                _insert_cells(cell, src, dsts, True)
377                            break
378                        else:
379                            src = own_xpos[ii]
380                            dsts = merged_xpos[ii : ii + 1]
381                            if debug:
382                                print(f"{src}->{dsts} M")
383                            for cell in cells:
384                                _insert_cells(cell, src, dsts, False)
385
386                if debug:
387                    print()
388                if self_xpos != merged_xpos:
389                    if debug:
390                        print(f"====== Self:  x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
391                    _move_cells(self.cells, self_xpos)
392                if other_xpos != merged_xpos:
393                    if debug:
394                        print(
395                            f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}"
396                        )
397                    _move_cells(other.cells, other_xpos)
398            if debug:
399                print()
400                print()
401                print()
402
403        # We must move the cells downwards now, but minus the header rows
404        rows = self.grid[1] - other.header_rows
405        for cell in other.cells:
406            # Discard the header cells, we just assume they are the same
407            if not cell.is_header:
408                cell._move(0, rows)
409                self.cells.append(cell)
410        self.cells.sort(key=lambda c: c.positions[0])
411        self.grid = (xgrid, other.grid[1] + rows)
412        if debug:
413            print(f"{self._page} -> {self.grid}")
414        return True
415
416    def append_side(self, other, expand=False) -> bool:
417        if self.grid[1] != other.grid[1]:
418            if expand:
419                _LOGGER.debug(
420                    f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})"
421                )
422                ymin = min(self.grid[1], other.grid[1])
423                ymax = max(self.grid[1], other.grid[1])
424                etable = other if self.grid[1] > other.grid[1] else self
425                for cell in etable.cells:
426                    if any(p[0] == ymin - 1 for p in cell.positions):
427                        cell._expand(0, ymax - ymin)
428                etable.grid = (etable.grid[0], ymax)
429            else:
430                _LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
431                return False
432
433        # We must move all cells to the right now
434        columns = self.grid[0]
435        for cell in other.cells:
436            cell._move(columns, 0)
437            self.cells.append(cell)
438        self.cells.sort(key=lambda c: c.positions[0])
439        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
440        return True
441
442    @cached_property
443    def header_rows(self) -> int:
444        header_cells = [c for c in self.cells if c.is_header]
445        if header_cells:
446            return max(c.positions[-1][0] + 1 for c in header_cells)
447        return 0
448
449    def __repr__(self) -> str:
450        return f"Table({self.grid[0]}x{self.grid[1]})"
Table(page, bbox: modm_data.utils.Rectangle, xlines: list, ylines: list, cbbox: modm_data.utils.Rectangle = None, is_register: bool = False)
 16    def __init__(
 17        self, page, bbox: Rectangle, xlines: list, ylines: list, cbbox: Rectangle = None, is_register: bool = False
 18    ):
 19        self._page = page
 20        self._spacing = page._spacing
 21        self.bbox = bbox
 22        self.cbbox = None if is_register else cbbox
 23        self._type = "table"
 24        self._bit_headers = None
 25
 26        # Coalesce the vertical lines to detect the grid
 27        def _cluster(lines, key):
 28            atol = min(self._spacing["y_em"], self._spacing["x_em"]) / 4
 29            grid = defaultdict(list)
 30            last = -1e9
 31            current = -1e9
 32            for line in sorted(lines, key=key):
 33                if (last + atol) < key(line):
 34                    current = key(line)
 35                grid[current].append(line)
 36                last = key(line)
 37            return grid
 38
 39        xgrid = _cluster(xlines, lambda line: line.p0.x)
 40        ygrid = _cluster(ylines, lambda line: line.p0.y)
 41
 42        if is_register:
 43            self._type = "register"
 44
 45            # Find the positions of the top numbers
 46            clusters = []
 47            if lines := self._page.charlines_in_area(cbbox):
 48                if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
 49                    clusters.append((cluster, cbbox))
 50                else:
 51                    self.grid = (0, 0)
 52                    _LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")
 53
 54            # Find the positions of the second row of numbers
 55            if len(ygrid) > 2:
 56                for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
 57                    nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y, self.bbox.right, ygrid[ypos1][0].p0.y)
 58                    if lines := self._page.charlines_in_area(nbbox):
 59                        if all(c.char.isnumeric() or c.unicode in {0x20, 0xA, 0xD} for c in lines[0].chars):
 60                            if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
 61                                clusters.append((cluster, nbbox))
 62                                self._bit_headers = len(ygrid) - yi - 1
 63                            else:
 64                                self.grid = (len(cluster), 0)
 65                                _LOGGER.warning(
 66                                    f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})"
 67                                )
 68                            break
 69
 70            # Merge these clusters to find their positions
 71            for cluster, bbox in clusters:
 72                # Close left and right side
 73                xgrid[sorted(xgrid)[0]].append(VLine(self.bbox.left, bbox.bottom, bbox.top))
 74                xgrid[sorted(xgrid)[-1]].append(VLine(self.bbox.right, bbox.bottom, bbox.top))
 75                # Now close the lines in between
 76                for cleft, cright in zip(cluster, cluster[1:]):
 77                    # find a line between the clusters
 78                    xpos = next(
 79                        (
 80                            (x, xgrid[x][0].p0.x)
 81                            for x in sorted(xgrid)
 82                            if cleft.bbox.right < xgrid[x][0].p0.x < cright.bbox.left
 83                        ),
 84                        None,
 85                    )
 86                    # Didn't find one, we must add one manually
 87                    if xpos is None:
 88                        xpos = (cleft.bbox.right + cright.bbox.left) / 2
 89                        xpos = (int(round(xpos)), xpos)
 90                    # Add it to the grid
 91                    xgrid[xpos[0]].append(VLine(xpos[1], bbox.bottom, bbox.top))
 92            # close the top
 93            ygrid[self.bbox.top].append(HLine(self.bbox.top, self.bbox.left, self.bbox.right))
 94
 95        # Fix the position keys properly
 96        self._xgrid = {int(round(statistics.fmean(m.p0.x for m in line))): line for line in xgrid.values()}
 97        self._ygrid = {int(round(statistics.fmean(m.p0.y for m in line))): line for line in ygrid.values()}
 98        # Map the positions to integers
 99        self._xpos = list(sorted(self._xgrid))
100        self._ypos = list(sorted(self._ygrid))
101
102        self.grid = (len(self._xpos) - 1, len(self._ypos) - 1)
103        self._cells = None
bbox
cbbox
grid
cells: list[modm_data.pdf2html.cell.Cell]
163    @property
164    def cells(self) -> list[Cell]:
165        if self._cells is None:
166            if self.grid < (1, 1):
167                self._cells = []
168                return self._cells
169
170            # First determine the spans of cells by checking the borders
171            cells = defaultdict(lambda: None)
172            for yi, (y0, y1) in enumerate(zip(self._ypos, self._ypos[1:])):
173                for xi, (x0, x1) in enumerate(zip(self._xpos, self._xpos[1:])):
174                    bbox = Rectangle(x0, y0, x1, y1)
175                    borders = self._cell_borders(xi, yi, bbox, 0b1111)
176                    cells[(xi, yi)] = Cell(self, (self.grid[1] - 1 - yi, xi), bbox, borders, self._type == "register")
177
178            # Fix table cell borders via consistency checks
179            for yi in range(self.grid[1]):
180                for xi in range(self.grid[0]):
181                    self._fix_borders(cells, xi, yi)
182
183            # Merge the cells recursively
184            def _merge(px, py, x, y):
185                if cells[(x, y)] is None:
186                    return
187                # print(cells[(x, y)])
188                # Right border is open
189                if not cells[(x, y)].borders.right:
190                    if cells[(x + 1, y)] is not None:
191                        cells[(px, py)]._merge(cells[(x + 1, y)])
192                        _merge(px, py, x + 1, y)
193                        cells[(x + 1, y)] = None
194                # Top border is open
195                if not cells[(x, y)].borders.top:
196                    if cells[(x, y + 1)] is not None:
197                        cells[(px, py)]._merge(cells[(x, y + 1)])
198                        _merge(px, py, x, y + 1)
199                        cells[(x, y + 1)] = None
200
201            # Start merging in bottom left cell
202            for yi in range(self.grid[1]):
203                for xi in range(self.grid[0]):
204                    _merge(xi, yi, xi, yi)
205
206            # Find the header line, it is thicker than normal
207            y_header_pos = self.grid[1]
208            if self._type != "register":
209                if self.grid[1] > 1:
210                    line_widths = {
211                        round(line.width, 1) for llist in self._ygrid.values() for line in llist if line.width != 0.1
212                    }  # magic width of virtual borders
213                    if line_widths:
214                        line_width_max = max(line_widths) * 0.9
215                        if min(line_widths) < line_width_max:
216                            # Find the first thick line starting from the top
217                            y_header_pos = next(
218                                (
219                                    yi
220                                    for yi, ypos in reversed(list(enumerate(self._ypos)))
221                                    if any(line.width > line_width_max for line in self._ygrid[ypos])
222                                ),
223                                y_header_pos,
224                            )
225
226                # Map all the header
227                is_bold = []
228                for yi in range(0 if y_header_pos == self.grid[1] else y_header_pos, self.grid[1]):
229                    bbox = None
230                    for xi in range(self.grid[0]):
231                        if (cell := cells[(xi, yi)]) is not None:
232                            if bbox is None:
233                                bbox = cell.bbox
234                            else:
235                                bbox = bbox.joined(cell.bbox)
236                    if bbox is None:
237                        continue
238                    chars = self._page.chars_in_area(bbox)
239                    is_bold_pct = sum(1 if "Bold" in c.font else 0 for c in chars) / len(chars) if chars else 1
240                    is_bold.append((yi, is_bold_pct > self._spacing["th"]))
241
242                # Some tables have no bold cells at all
243                if all(not b[1] for b in is_bold):
244                    # Special case for two row tables without bold headers, but a bold line inbetween
245                    if self.grid[1] == 2 and y_header_pos == 1:
246                        y_header_pos = 2
247                else:
248                    if y_header_pos < self.grid[1]:
249                        # Find the lowest bold row starting from bold line
250                        y_header_pos = next((b[0] for b in is_bold if y_header_pos <= b[0] and b[1]), y_header_pos)
251                    else:
252                        # Find the lowest bold row starting from the top
253                        for b in reversed(is_bold):
254                            if not b[1]:
255                                break
256                            y_header_pos = b[0]
257
258            # Tell the header cells
259            for yi in range(y_header_pos, self.grid[1]):
260                for xi in range(self.grid[0]):
261                    if (cell := cells[(xi, yi)]) is not None:
262                        cell.is_header = True
263
264            # Flatten into array
265            cells = [c for c in cells.values() if c is not None]
266
267            # Normalize cells for registers by moving the lower ones right and up
268            if self._type == "register" and self._bit_headers is not None:
269                for cell in cells:
270                    if cell.y >= self._bit_headers:
271                        cell._move(16, -self._bit_headers)
272                    elif self._bit_headers <= 2 and cell.y == self._bit_headers - 1:
273                        cell._expand(0, 3 - self._bit_headers)
274                self.grid = (32, 4)
275
276            self._cells = list(sorted(cells, key=lambda c: c.positions[0]))
277
278        return self._cells
def append_bottom(self, other, merge_headers=True) -> bool:
280    def append_bottom(self, other, merge_headers=True) -> bool:
281        debug = False
282        xgrid = self.grid[0]
283        if merge_headers and xgrid != other.grid[0]:
284            # Some tables have different column layouts due to span cells
285            # So we must correct the X positions of all cells accordingly
286            self_xheaders = defaultdict(set)
287            other_xheaders = defaultdict(set)
288            self_headers = [c for c in self.cells if c.is_header]
289            other_headers = [c for c in other.cells if c.is_header]
290            # Find the smallest set of spanning xpositions based on the header cells
291            for xpos in range(self.grid[0]):
292                for hcell in self_headers:
293                    if any(p[1] == xpos for p in hcell.positions):
294                        self_xheaders[hcell.x].add(xpos)
295            for xpos in range(other.grid[0]):
296                for hcell in other_headers:
297                    if any(p[1] == xpos for p in hcell.positions):
298                        other_xheaders[hcell.x].add(xpos)
299
300            # Compute the shared
301            self_heads = sorted(self_xheaders.keys())
302            other_heads = sorted(other_xheaders.keys())
303            xgrid = 0
304            merged_xheaders = defaultdict(set)
305            # Zip the groups together, these represent the matching header group spans
306            for self_xhead, other_xhead in zip(self_heads, other_heads):
307                size = max(len(self_xheaders[self_xhead]), len(other_xheaders[other_xhead]))
308                merged_xheaders[max(self_xhead, other_xhead)] = set(range(xgrid, xgrid + size))
309                xgrid += size
310
311            if debug:
312                print(len(self_xheaders), self_xheaders)
313                print(len(other_xheaders), other_xheaders)
314                print(len(merged_xheaders), merged_xheaders)
315            # If they are not equal length the table layouts are not compatible at all!
316            if len(self_heads) != len(other_heads):
317                _LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
318                return False
319
320            # We want to stuff/move the cell positions inplace, therefore we start
321            # backwards moving the high numbers even higher, so that we don't
322            # overwrite ourselves and get stuck in an infinite loop
323            # Zip the groups together, these represent the matching header group spans
324            for self_xhead, other_xhead in zip(reversed(self_heads), reversed(other_heads)):
325                merged_xhead = max(self_xhead, other_xhead)
326                self_xpos = sorted(self_xheaders[self_xhead], reverse=True)
327                other_xpos = sorted(other_xheaders[other_xhead], reverse=True)
328                merged_xpos = sorted(merged_xheaders[merged_xhead], reverse=True)
329
330                def _insert_cells(cell, src, dsts, insert_only):
331                    assert dsts
332                    new_positions = []
333                    any_change = False
334                    for cpos in reversed(cell.positions):
335                        if insert_only:
336                            # If our set is empty we must only insert positions
337                            if cpos[1] == src:
338                                for xpos in dsts:
339                                    if debug:
340                                        print(f"Insert {cpos}++{(cpos[0], xpos)}")
341                                    new_positions.append((cpos[0], xpos))
342                                    any_change = True
343                            new_positions.append(cpos)
344                        else:
345                            # We must move (=replace and add) the span positions
346                            if cpos[1] == src:
347                                if debug:
348                                    print(f"Move {cpos}->{(cpos[0], dsts[0])}")
349                                new_positions.append((cpos[0], dsts[0]))
350                                any_change = True
351                            else:
352                                new_positions.append(cpos)
353                    if debug and any_change:
354                        print(f"{cell}: {src}->{dsts} {'I' if insert_only else 'M'}")
355                        print("old=", cell.positions, "new=", sorted(new_positions))
356                        print()
357                    assert new_positions
358                    assert len(new_positions) == len(set(new_positions))
359                    cell.positions = sorted(new_positions)
360                    cell._invalidate()
361
362                def _move_cells(cells, own_xpos):
363                    if debug:
364                        print()
365                        print(f"====== Rewrite rows: {own_xpos}->{merged_xpos} ======")
366                        print()
367
368                    for ii in range(max(len(own_xpos), len(merged_xpos))):
369                        insert_only = ii >= len(own_xpos)
370                        if insert_only:
371                            src = merged_xpos[ii - 1]
372                            dsts = merged_xpos[ii:]
373                            if debug:
374                                print(f"{src}->{dsts} I")
375                            for cell in cells:
376                                _insert_cells(cell, src, dsts, True)
377                            break
378                        else:
379                            src = own_xpos[ii]
380                            dsts = merged_xpos[ii : ii + 1]
381                            if debug:
382                                print(f"{src}->{dsts} M")
383                            for cell in cells:
384                                _insert_cells(cell, src, dsts, False)
385
386                if debug:
387                    print()
388                if self_xpos != merged_xpos:
389                    if debug:
390                        print(f"====== Self:  x={self_xhead}->{merged_xhead} xpos={self_xpos}->{merged_xpos}")
391                    _move_cells(self.cells, self_xpos)
392                if other_xpos != merged_xpos:
393                    if debug:
394                        print(
395                            f"====== Other: x={other_xhead}->{merged_xhead} xpos={other_xheaders[other_xhead]}->{merged_xheaders[merged_xhead]}"
396                        )
397                    _move_cells(other.cells, other_xpos)
398            if debug:
399                print()
400                print()
401                print()
402
403        # We must move the cells downwards now, but minus the header rows
404        rows = self.grid[1] - other.header_rows
405        for cell in other.cells:
406            # Discard the header cells, we just assume they are the same
407            if not cell.is_header:
408                cell._move(0, rows)
409                self.cells.append(cell)
410        self.cells.sort(key=lambda c: c.positions[0])
411        self.grid = (xgrid, other.grid[1] + rows)
412        if debug:
413            print(f"{self._page} -> {self.grid}")
414        return True
def append_side(self, other, expand=False) -> bool:
416    def append_side(self, other, expand=False) -> bool:
417        if self.grid[1] != other.grid[1]:
418            if expand:
419                _LOGGER.debug(
420                    f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})"
421                )
422                ymin = min(self.grid[1], other.grid[1])
423                ymax = max(self.grid[1], other.grid[1])
424                etable = other if self.grid[1] > other.grid[1] else self
425                for cell in etable.cells:
426                    if any(p[0] == ymin - 1 for p in cell.positions):
427                        cell._expand(0, ymax - ymin)
428                etable.grid = (etable.grid[0], ymax)
429            else:
430                _LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
431                return False
432
433        # We must move all cells to the right now
434        columns = self.grid[0]
435        for cell in other.cells:
436            cell._move(columns, 0)
437            self.cells.append(cell)
438        self.cells.sort(key=lambda c: c.positions[0])
439        self.grid = (other.grid[0] + columns, max(self.grid[1], other.grid[1]))
440        return True
header_rows: int
442    @cached_property
443    def header_rows(self) -> int:
444        header_cells = [c for c in self.cells if c.is_header]
445        if header_cells:
446            return max(c.positions[-1][0] + 1 for c in header_cells)
447        return 0
class VirtualTable(Table):
class VirtualTable(Table):
    """A table built from a given list of cells instead of detected grid lines."""

    def __init__(self, page, bbox, cells, table_type=None):
        """Adopt `cells` as the content of this table.

        `Table.__init__()` is not called, no grid detection takes place. The
        grid size is derived from the largest `x` and `y` of the cells, and
        every cell gets its `_table` pointed at this table.

        :param page: page the table belongs to; must provide `_spacing`.
        :param bbox: bounding box of the table.
        :param cells: the cells of the table; stored as is, not copied.
        :param table_type: type name of the table, `"virtual"` if not given.
        """
        self._page = page
        self._spacing = page._spacing
        self._type = table_type or "virtual"
        self.bbox = bbox
        self._cells = cells
        # default=-1 yields a (0, 0) grid for an empty cell list instead of
        # raising ValueError from max()
        self.grid = (
            max((c.x for c in cells), default=-1) + 1,
            max((c.y for c in cells), default=-1) + 1,
        )
        for cell in cells:
            cell._table = self

    def __repr__(self) -> str:
        """Short description of the table stating its grid as `columns x rows`."""
        return f"VTable({self.grid[0]}x{self.grid[1]})"
VirtualTable(page, bbox, cells, table_type=None)
454    def __init__(self, page, bbox, cells, table_type=None):
455        self._page = page
456        self._spacing = page._spacing
457        self._type = table_type or "virtual"
458        self.bbox = bbox
459        self._cells = cells
460        self.grid = (max(c.x for c in cells) + 1, max(c.y for c in cells) + 1)
461        for cell in cells:
462            cell._table = self
bbox
grid