modm_data.pdf2html.ast

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4import logging
  5import anytree
  6from anytree import RenderTree, Node
  7from collections import defaultdict
  8from ..utils import Rectangle, ReversePreOrderIter
  9from .table import VirtualTable, Cell
 10
 11_LOGGER = logging.getLogger(__name__)
 12
 13
 14def _normalize_area(area: Node) -> Node:
 15    for child in ReversePreOrderIter(area):
 16        if child.name.startswith("list"):
 17            # We need to normalize the xpos back to the first character
 18            child.xpos = int(child.obj.bbox.left) - area.xpos
 19        else:
 20            # And then make the xpos relative to the area left for consistent comparisons
 21            child.xpos -= area.xpos
 22    area.xpos = 0
 23    return area
 24
 25
 26def merge_area(document: Node, area: Node, debug: bool = False) -> Node:
 27    if document is None:
 28        document = Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None)
 29        document._end = document
 30    if not area.children:
 31        return document
 32    if debug:
 33        _LOGGER.debug()
 34
 35    def _find_end(node):
 36        # Find the last leaf node but skip lines, paragraphs, captions/tables/figures
 37        return next(
 38            (c for c in ReversePreOrderIter(node) if any(c.name.startswith(name) for name in {"head", "list", "note"})),
 39            next(ReversePreOrderIter(node), node),
 40        )
 41
 42    def _find_ancestor(filter_):
 43        if filter_(document._end):
 44            return document._end
 45        return next((c for c in document._end.iter_path_reverse() if filter_(c)), document.root)
 46
 47    area = _normalize_area(area)
 48    if debug:
 49        _LOGGER.debug(RenderTree(area))
 50    children = area.children
 51    # All area nodes up to the next top-level element must now be
 52    # xpos-aligned with the previous area's last leaf node
 53    connect_index = next((ii for ii, c in enumerate(children) if c.name.startswith("head")), len(children))
 54    x_em = area.page._spacing["x_em"]
 55
 56    if debug:
 57        _LOGGER.debug("area=", area, "connect_index=", connect_index)
 58    # Align these children with the last leaf node xpos
 59    for child in children[:connect_index]:
 60        if any(child.name.startswith(name) for name in {"list"}):
 61            # Find the node that is left of the current node but not too far left
 62            host = _find_ancestor(lambda c: -4 * x_em < (c.xpos - child.xpos) < -x_em or c.name.startswith("head"))
 63        elif (
 64            child.name == "para"
 65            and document._end.name == "note"
 66            and child.children[0].obj.contains_font("Italic", "Oblique")
 67        ):
 68            host = document._end
 69        else:
 70            # Insert underneath the next heading
 71            host = _find_ancestor(lambda c: c.name.startswith("head"))
 72
 73        child.parent = host
 74        document._end = _find_end(document)
 75        if debug:
 76            _LOGGER.debug(
 77                f"{child=}",
 78            )
 79            _LOGGER.debug(f"{host=}")
 80            _LOGGER.debug(f"end={document._end}")
 81            _LOGGER.debug()
 82
 83    # Add the remaining top-level children to connect index node
 84    if connect_index < len(children):
 85        children[connect_index].parent = document
 86        for child in children[connect_index + 1 :]:
 87            child.parent = children[connect_index]
 88
 89    document._end = _find_end(document)
 90
 91    if debug:
 92        _LOGGER.debug()
 93        _LOGGER.debug()
 94
 95    return document
 96
 97
 98def normalize_lists(node: Node) -> Node:
 99    lists = []
100    current = []
101    current_name = None
102    for child in node.children:
103        # Normalize the lists from the leaves up
104        normalize_lists(child)
105        # then split the children based on their names
106        if current_name is None or child.name == current_name:
107            current.append(child)
108        else:
109            lists.append(current)
110            current = [child]
111        current_name = child.name
112    if current:
113        lists.append(current)
114
115    # Create a new list of children
116    new_children = []
117    for llist in lists:
118        # Insert a new list group node and redirect all children to it
119        if llist[0].name.startswith("list"):
120            nlist = Node(llist[0].name, obj=llist[0].obj, start=llist[0].value, xpos=llist[0].xpos)
121            for lnode in llist:
122                lnode.name = "element"
123                lnode.parent = nlist
124
125            new_children.append(nlist)
126        else:
127            new_children.extend(llist)
128
129    # Set the new children which have the same order
130    node.children = new_children
131    return node
132
133
def normalize_paragraphs(document: Node) -> Node:
    """Replace paragraphs by their first child where a wrapper is not needed.

    Only parents named ``element``, ``caption``, ``document`` or ``cell`` are
    considered. A ``caption`` always has its paragraphs replaced; the other
    parents only when they contain exactly one ``para`` child. If the parent
    ends up with several ``text`` children, their children are moved into the
    first ``text`` node and the emptied nodes are detached.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    unwrappable = {"element", "caption", "document", "cell"}
    paragraphs = anytree.search.findall(document, filter_=lambda n: n.name == "para")
    hosts = {para.parent for para in paragraphs if para.parent.name in unwrappable}
    for host in hosts:
        # Replace the paragraph only if it's the *only* paragraph in this node
        if host.name != "caption" and sum(kid.name == "para" for kid in host.children) != 1:
            continue
        # Build the replacement list in place to preserve the children order
        replaced = []
        for kid in host.children:
            replaced.append(kid.children[0] if kid.name == "para" else kid)
        host.children = replaced
        # Merge all text nodes into the first one
        texts = [kid for kid in host.children if kid.name == "text"]
        for extra in texts[1:]:
            for line in extra.children:
                line.parent = texts[0]
            extra.parent = None
    return document
151
152
def normalize_lines(document: Node) -> Node:
    """Insert a single ``text`` node between every ``para`` and its children.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    for paragraph in anytree.search.findall(document, filter_=lambda n: n.name == "para"):
        wrapper = Node("text")
        # Move the existing children over one by one, keeping their order
        for member in paragraph.children:
            member.parent = wrapper
        paragraph.children = [wrapper]
    return document
161
162
def normalize_captions(document: Node) -> Node:
    """Attach every ``caption`` node to the node it describes.

    The search window is the caption itself plus its next five siblings. The
    first node in that window whose name equals ``caption._type`` becomes the
    caption's parent and receives the caption's ``number``. A caption without
    a match is logged as an error and detached from the tree.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    for caption in anytree.search.findall(document, filter_=lambda n: n.name == "caption"):
        siblings = caption.parent.children
        start = siblings.index(caption)
        # Find the next table for this caption within 5 nodes
        target = next((node for node in siblings[start : start + 6] if node.name == caption._type), None)
        if target is None:
            _LOGGER.error(f"Discarding caption {caption}!\n{RenderTree(caption)}")
            caption.parent = None
            continue
        caption.parent = target
        target.number = caption.number
    return document
177
178
def normalize_headings(document: Node) -> Node:
    """Turn every ``head*`` node into a ``section`` with a heading child.

    The first child of each heading node takes over the heading's name and
    ``marker``, unless the first child of that child is empty, in which case
    it is detached. The heading node itself is renamed to ``section`` either
    way.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    for heading in anytree.search.findall(document, filter_=lambda n: n.name.startswith("head")):
        title = heading.children[0]
        if title.children[0].children:
            # Rename paragraph to heading
            title.__dict__["marker"] = heading.marker
            title.name = heading.name
        else:
            # Remove empty headers
            title.parent = None
        heading.name = "section"
    return document
192
193
def normalize_registers(document: Node) -> Node:
    """Group consecutive ``bit`` nodes into ``table`` nodes of type ``bits``.

    Within every ``section`` (and the document itself) each run of directly
    consecutive ``bit`` children on the same ``_page`` is moved underneath a
    new ``table`` node. Afterwards each group gets a ``VirtualTable`` object
    with two cells per bit: one spanning ``_left`` to ``_middle`` and one
    spanning ``_middle`` to ``_right``, vertically bounded by the bit's first
    and last ``line`` descendant. The bit nodes are detached from the group.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    groups = []
    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    for section in (*sections, document):
        kept = []
        group = None
        for child in section.children:
            if child.name != "bit":
                # Any other node ends the current run of bits
                group = None
                kept.append(child)
                continue
            # Insert a new bits group node when a run starts or the page changes
            if group is None or group._page != child._page:
                group = Node("table", xpos=child.xpos, obj=None, _type="bits", _width=1, _page=child._page)
                kept.append(group)
                groups.append(group)
            child.parent = group
        # The kept children have the same order as before
        section.children = kept

    # Reformat the bits nodes into tables
    for group in groups:
        cells = []
        for row, bit in enumerate(group.children):
            bit.parent = None
            # The top is given by the first line, the bottom by the last line
            top = next(c.obj.bbox.top for c in bit.descendants if c.name == "line")
            bottom = next(c.obj.bbox.bottom for c in reversed(bit.descendants) if c.name == "line")
            cells += [
                # Left table cell contains Bits
                Cell(None, (row, 0), Rectangle(bit._left, bottom, bit._middle, top), (1, 1, 1, 1), is_simple=True),
                # Right cell contains description
                Cell(None, (row, 1), Rectangle(bit._middle, bottom, bit._right, top), (1, 1, 1, 1)),
            ]
        boxes = [cell.bbox for cell in cells]
        outline = Rectangle(
            min(box.left for box in boxes),
            min(box.bottom for box in boxes),
            max(box.right for box in boxes),
            max(box.top for box in boxes),
        )
        group.obj = VirtualTable(group._page, outline, cells, "bitfield")

    return document
237
238
def normalize_tables(document: Node) -> Node:
    """Merge tables that were split into several ``table`` nodes.

    The children of every ``section`` (and of the document) are scanned:

    - tables of type ``table`` with a positive ``number`` are collected per
      number; for the ``blue_gray`` page template an unnumbered table directly
      after a numbered one is added to the same collection;
    - tables of type ``register`` and of type ``bits`` are collected as runs
      of directly consecutive tables.

    The collected tables are then merged into the first table of each
    collection via ``append_bottom`` or ``append_side``; a table whose merge
    call returns a true value is detached from the tree.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    content_tables = defaultdict(list)
    register_runs = []
    bits_runs = []
    pending_registers = []
    pending_bits = []

    def _flush():
        # Close the currently open register/bits runs, if any
        nonlocal pending_registers, pending_bits
        if pending_registers:
            register_runs.append(pending_registers)
            pending_registers = []
        if pending_bits:
            bits_runs.append(pending_bits)
            pending_bits = []

    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    last_number = 0
    for section in (*sections, document):
        pending_registers = []
        pending_bits = []
        for child in section.children:
            if child.name != "table":
                _flush()
                last_number = 0
                continue
            kind = child._type
            if kind == "register":
                # Collect register tables that follow each other directly
                pending_registers.append(child)
            elif kind == "bits":
                # Collect bits tables that follow each other directly
                pending_bits.append(child)
            elif kind != "table":
                last_number = 0
            else:
                if child.number > 0:
                    # Collect tables with the same number together
                    content_tables[child.number].append(child)
                    if document._page._template == "blue_gray":
                        last_number = child.number
                elif last_number > 0:
                    # Tables without caption may follow
                    content_tables[last_number].append(child)
                _flush()
        _flush()
        last_number = 0
    _flush()

    # Merge all tables of the same number by appending at the bottom
    for tables in content_tables.values():
        primary, *followers = tables
        for table in followers:
            print(f"T{table.obj._page.number} ", end="")
            if primary.obj.append_bottom(table.obj):
                table.parent = None
    # Merge all register tables by appending to the right
    for tables in register_runs:
        primary, *followers = tables
        for table in followers:
            if primary.obj.append_side(table.obj, expand=True):
                table.parent = None
    # Merge all bits tables by appending at the bottom
    for tables in bits_runs:
        primary, *followers = tables
        for table in followers:
            if primary.obj.append_bottom(table.obj, merge_headers=False):
                table.parent = None

    return document
306
307
def normalize_chapters(document: Node) -> Node:
    """Group the top-level nodes of *document* underneath ``chapter`` nodes.

    A chapter starts at every top-level node that is the parent of a
    ``head1`` or ``head2`` node; nodes in front of the first such node form a
    chapter of their own. The chapter title is built from the characters of
    all ``line`` nodes below the first child of the chapter's first node,
    prefixed with ``"0 "`` when that child is named ``head1``. The file name
    is the lower-cased title with the characters `` /()-,:`` replaced by
    underscores.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    headings = anytree.search.findall(document, filter_=lambda n: n.name in ["head1", "head2"], maxlevel=3)
    # Chapter boundaries: the index of each heading's parent, closed by the end index
    bounds = [document.children.index(h.parent) for h in headings] + [len(document.children)]
    if bounds[0] != 0:
        # Nodes in front of the first heading get their own chapter
        bounds = [0] + bounds

    cleaner = str.maketrans(" /()-,:", "_______")

    # Collect all chapters first: creating the chapter nodes below appends to
    # document.children and would invalidate the indices.
    chapters = []
    for begin, end in zip(bounds, bounds[1:]):
        # Find the chapter name
        heading = document.children[begin].children[0]
        lines = anytree.search.findall(heading, filter_=lambda n: n.name == "line")
        title = " ".join("".join(c.char for c in line.obj.chars).strip() for line in lines)
        if heading.name == "head1":
            title = "0 " + title
        filename = title.lower().translate(cleaner)
        # Half-open slice: the node at `end` belongs to the next chapter only
        chapters.append((title, filename, document.children[begin:end]))

    for title, filename, nodes in chapters:
        chapter = Node("chapter", title=title, _filename=filename, parent=document)
        for node in nodes:
            node.parent = chapter

    return document
def merge_area( document: anytree.node.node.Node, area: anytree.node.node.Node, debug: bool = False) -> anytree.node.node.Node:
def merge_area(document: Node, area: Node, debug: bool = False) -> Node:
    """Merge the nodes of *area* into *document* and return the document.

    A new ``document`` root node is created when *document* is ``None``. An
    area without children leaves the document untouched.

    Area children in front of the first ``head*`` node are attached to a host
    that is searched starting at ``document._end``:

    - ``list*`` nodes go to the closest node on the path from
      ``document._end`` to the root that lies more than one but less than
      four ``x_em`` to their left, or to the closest heading on that path;
    - a ``para`` is appended to ``document._end`` when that node is a
      ``note`` and ``contains_font("Italic", "Oblique")`` of the paragraph's
      first child object is true;
    - everything else is attached to the closest heading on that path.

    The document root is used when no host matches. The first ``head*`` node
    of the area is attached to the document itself and all area children
    following it are attached to that heading node.

    :param document: document tree built so far, or ``None`` to start one.
    :param area: area tree to merge; its ``xpos`` values are normalised in place.
    :param debug: log the intermediate state via ``_LOGGER.debug``.
    :return: the (possibly newly created) document node.
    """
    if document is None:
        document = Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None)
        document._end = document
    if not area.children:
        return document

    def _find_end(node):
        # Find the last leaf node but skip lines, paragraphs, captions/tables/figures.
        # Falls back to the first node the reverse iteration yields, or the node itself.
        return next(
            (c for c in ReversePreOrderIter(node) if c.name.startswith(("head", "list", "note"))),
            next(ReversePreOrderIter(node), node),
        )

    def _find_ancestor(filter_):
        # Check the current end node first, then walk towards the root
        if filter_(document._end):
            return document._end
        return next((c for c in document._end.iter_path_reverse() if filter_(c)), document.root)

    area = _normalize_area(area)
    if debug:
        _LOGGER.debug("%s", RenderTree(area))
    children = area.children
    # All area nodes up to the next top-level element must now be
    # xpos-aligned with the previous area's last leaf node
    connect_index = next((ii for ii, c in enumerate(children) if c.name.startswith("head")), len(children))
    x_em = area.page._spacing["x_em"]

    if debug:
        _LOGGER.debug("area=%s connect_index=%s", area, connect_index)
    # Align these children with the last leaf node xpos
    for child in children[:connect_index]:
        if child.name.startswith("list"):
            # Find the node that is left of the current node but not too far left
            host = _find_ancestor(lambda c: -4 * x_em < (c.xpos - child.xpos) < -x_em or c.name.startswith("head"))
        elif (
            child.name == "para"
            and document._end.name == "note"
            and child.children[0].obj.contains_font("Italic", "Oblique")
        ):
            host = document._end
        else:
            # Insert underneath the next heading
            host = _find_ancestor(lambda c: c.name.startswith("head"))

        child.parent = host
        document._end = _find_end(document)
        if debug:
            _LOGGER.debug("child=%r", child)
            _LOGGER.debug("host=%r", host)
            _LOGGER.debug("end=%s", document._end)

    # Add the remaining top-level children to connect index node
    if connect_index < len(children):
        children[connect_index].parent = document
        for child in children[connect_index + 1 :]:
            child.parent = children[connect_index]

    document._end = _find_end(document)
    return document
def normalize_lists(node: anytree.node.node.Node) -> anytree.node.node.Node:
 99def normalize_lists(node: Node) -> Node:
100    lists = []
101    current = []
102    current_name = None
103    for child in node.children:
104        # Normalize the lists from the leaves up
105        normalize_lists(child)
106        # then split the children based on their names
107        if current_name is None or child.name == current_name:
108            current.append(child)
109        else:
110            lists.append(current)
111            current = [child]
112        current_name = child.name
113    if current:
114        lists.append(current)
115
116    # Create a new list of children
117    new_children = []
118    for llist in lists:
119        # Insert a new list group node and redirect all children to it
120        if llist[0].name.startswith("list"):
121            nlist = Node(llist[0].name, obj=llist[0].obj, start=llist[0].value, xpos=llist[0].xpos)
122            for lnode in llist:
123                lnode.name = "element"
124                lnode.parent = nlist
125
126            new_children.append(nlist)
127        else:
128            new_children.extend(llist)
129
130    # Set the new children which have the same order
131    node.children = new_children
132    return node
def normalize_paragraphs(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_paragraphs(document: Node) -> Node:
    """Replace paragraphs by their first child where a wrapper is not needed.

    Only parents named ``element``, ``caption``, ``document`` or ``cell`` are
    considered. A ``caption`` always has its paragraphs replaced; the other
    parents only when they contain exactly one ``para`` child. If the parent
    ends up with several ``text`` children, their children are moved into the
    first ``text`` node and the emptied nodes are detached.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    unwrappable = {"element", "caption", "document", "cell"}
    paragraphs = anytree.search.findall(document, filter_=lambda n: n.name == "para")
    hosts = {para.parent for para in paragraphs if para.parent.name in unwrappable}
    for host in hosts:
        # Replace the paragraph only if it's the *only* paragraph in this node
        if host.name != "caption" and sum(kid.name == "para" for kid in host.children) != 1:
            continue
        # Build the replacement list in place to preserve the children order
        replaced = []
        for kid in host.children:
            replaced.append(kid.children[0] if kid.name == "para" else kid)
        host.children = replaced
        # Merge all text nodes into the first one
        texts = [kid for kid in host.children if kid.name == "text"]
        for extra in texts[1:]:
            for line in extra.children:
                line.parent = texts[0]
            extra.parent = None
    return document
def normalize_lines(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_lines(document: Node) -> Node:
    """Insert a single ``text`` node between every ``para`` and its children.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    for paragraph in anytree.search.findall(document, filter_=lambda n: n.name == "para"):
        wrapper = Node("text")
        # Move the existing children over one by one, keeping their order
        for member in paragraph.children:
            member.parent = wrapper
        paragraph.children = [wrapper]
    return document
def normalize_captions(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_captions(document: Node) -> Node:
    """Attach every ``caption`` node to the node it describes.

    The search window is the caption itself plus its next five siblings. The
    first node in that window whose name equals ``caption._type`` becomes the
    caption's parent and receives the caption's ``number``. A caption without
    a match is logged as an error and detached from the tree.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    for caption in anytree.search.findall(document, filter_=lambda n: n.name == "caption"):
        siblings = caption.parent.children
        start = siblings.index(caption)
        # Find the next table for this caption within 5 nodes
        target = next((node for node in siblings[start : start + 6] if node.name == caption._type), None)
        if target is None:
            _LOGGER.error(f"Discarding caption {caption}!\n{RenderTree(caption)}")
            caption.parent = None
            continue
        caption.parent = target
        target.number = caption.number
    return document
def normalize_headings(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_headings(document: Node) -> Node:
    """Turn every ``head*`` node into a ``section`` with a heading child.

    The first child of each heading node takes over the heading's name and
    ``marker``, unless the first child of that child is empty, in which case
    it is detached. The heading node itself is renamed to ``section`` either
    way.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    for heading in anytree.search.findall(document, filter_=lambda n: n.name.startswith("head")):
        title = heading.children[0]
        if title.children[0].children:
            # Rename paragraph to heading
            title.__dict__["marker"] = heading.marker
            title.name = heading.name
        else:
            # Remove empty headers
            title.parent = None
        heading.name = "section"
    return document
def normalize_registers(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_registers(document: Node) -> Node:
    """Group consecutive ``bit`` nodes into ``table`` nodes of type ``bits``.

    Within every ``section`` (and the document itself) each run of directly
    consecutive ``bit`` children on the same ``_page`` is moved underneath a
    new ``table`` node. Afterwards each group gets a ``VirtualTable`` object
    with two cells per bit: one spanning ``_left`` to ``_middle`` and one
    spanning ``_middle`` to ``_right``, vertically bounded by the bit's first
    and last ``line`` descendant. The bit nodes are detached from the group.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    groups = []
    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    for section in (*sections, document):
        kept = []
        group = None
        for child in section.children:
            if child.name != "bit":
                # Any other node ends the current run of bits
                group = None
                kept.append(child)
                continue
            # Insert a new bits group node when a run starts or the page changes
            if group is None or group._page != child._page:
                group = Node("table", xpos=child.xpos, obj=None, _type="bits", _width=1, _page=child._page)
                kept.append(group)
                groups.append(group)
            child.parent = group
        # The kept children have the same order as before
        section.children = kept

    # Reformat the bits nodes into tables
    for group in groups:
        cells = []
        for row, bit in enumerate(group.children):
            bit.parent = None
            # The top is given by the first line, the bottom by the last line
            top = next(c.obj.bbox.top for c in bit.descendants if c.name == "line")
            bottom = next(c.obj.bbox.bottom for c in reversed(bit.descendants) if c.name == "line")
            cells += [
                # Left table cell contains Bits
                Cell(None, (row, 0), Rectangle(bit._left, bottom, bit._middle, top), (1, 1, 1, 1), is_simple=True),
                # Right cell contains description
                Cell(None, (row, 1), Rectangle(bit._middle, bottom, bit._right, top), (1, 1, 1, 1)),
            ]
        boxes = [cell.bbox for cell in cells]
        outline = Rectangle(
            min(box.left for box in boxes),
            min(box.bottom for box in boxes),
            max(box.right for box in boxes),
            max(box.top for box in boxes),
        )
        group.obj = VirtualTable(group._page, outline, cells, "bitfield")

    return document
def normalize_tables(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_tables(document: Node) -> Node:
    """Merge tables that were split into several ``table`` nodes.

    The children of every ``section`` (and of the document) are scanned:

    - tables of type ``table`` with a positive ``number`` are collected per
      number; for the ``blue_gray`` page template an unnumbered table directly
      after a numbered one is added to the same collection;
    - tables of type ``register`` and of type ``bits`` are collected as runs
      of directly consecutive tables.

    The collected tables are then merged into the first table of each
    collection via ``append_bottom`` or ``append_side``; a table whose merge
    call returns a true value is detached from the tree.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    content_tables = defaultdict(list)
    register_runs = []
    bits_runs = []
    pending_registers = []
    pending_bits = []

    def _flush():
        # Close the currently open register/bits runs, if any
        nonlocal pending_registers, pending_bits
        if pending_registers:
            register_runs.append(pending_registers)
            pending_registers = []
        if pending_bits:
            bits_runs.append(pending_bits)
            pending_bits = []

    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    last_number = 0
    for section in (*sections, document):
        pending_registers = []
        pending_bits = []
        for child in section.children:
            if child.name != "table":
                _flush()
                last_number = 0
                continue
            kind = child._type
            if kind == "register":
                # Collect register tables that follow each other directly
                pending_registers.append(child)
            elif kind == "bits":
                # Collect bits tables that follow each other directly
                pending_bits.append(child)
            elif kind != "table":
                last_number = 0
            else:
                if child.number > 0:
                    # Collect tables with the same number together
                    content_tables[child.number].append(child)
                    if document._page._template == "blue_gray":
                        last_number = child.number
                elif last_number > 0:
                    # Tables without caption may follow
                    content_tables[last_number].append(child)
                _flush()
        _flush()
        last_number = 0
    _flush()

    # Merge all tables of the same number by appending at the bottom
    for tables in content_tables.values():
        primary, *followers = tables
        for table in followers:
            print(f"T{table.obj._page.number} ", end="")
            if primary.obj.append_bottom(table.obj):
                table.parent = None
    # Merge all register tables by appending to the right
    for tables in register_runs:
        primary, *followers = tables
        for table in followers:
            if primary.obj.append_side(table.obj, expand=True):
                table.parent = None
    # Merge all bits tables by appending at the bottom
    for tables in bits_runs:
        primary, *followers = tables
        for table in followers:
            if primary.obj.append_bottom(table.obj, merge_headers=False):
                table.parent = None

    return document
def normalize_chapters(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_chapters(document: Node) -> Node:
    """Group the top-level nodes of *document* underneath ``chapter`` nodes.

    A chapter starts at every top-level node that is the parent of a
    ``head1`` or ``head2`` node; nodes in front of the first such node form a
    chapter of their own. The chapter title is built from the characters of
    all ``line`` nodes below the first child of the chapter's first node,
    prefixed with ``"0 "`` when that child is named ``head1``. The file name
    is the lower-cased title with the characters `` /()-,:`` replaced by
    underscores.

    :param document: root of the tree, modified in place.
    :return: the same *document*.
    """
    headings = anytree.search.findall(document, filter_=lambda n: n.name in ["head1", "head2"], maxlevel=3)
    # Chapter boundaries: the index of each heading's parent, closed by the end index
    bounds = [document.children.index(h.parent) for h in headings] + [len(document.children)]
    if bounds[0] != 0:
        # Nodes in front of the first heading get their own chapter
        bounds = [0] + bounds

    cleaner = str.maketrans(" /()-,:", "_______")

    # Collect all chapters first: creating the chapter nodes below appends to
    # document.children and would invalidate the indices.
    chapters = []
    for begin, end in zip(bounds, bounds[1:]):
        # Find the chapter name
        heading = document.children[begin].children[0]
        lines = anytree.search.findall(heading, filter_=lambda n: n.name == "line")
        title = " ".join("".join(c.char for c in line.obj.chars).strip() for line in lines)
        if heading.name == "head1":
            title = "0 " + title
        filename = title.lower().translate(cleaner)
        # Half-open slice: the node at `end` belongs to the next chapter only
        chapters.append((title, filename, document.children[begin:end]))

    for title, filename, nodes in chapters:
        chapter = Node("chapter", title=title, _filename=filename, parent=document)
        for node in nodes:
            node.parent = chapter

    return document