modm_data.pdf2html.ast
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

import logging
import anytree
from anytree import RenderTree, Node
from collections import defaultdict
from ..utils import Rectangle, ReversePreOrderIter
from .table import VirtualTable, Cell

_LOGGER = logging.getLogger(__name__)


def _normalize_area(area: Node) -> Node:
    """Make every ``xpos`` below ``area`` relative to the area's own ``xpos``.

    :param area: the area node; it is modified in place.
    :return: the same ``area`` node, with its ``xpos`` reset to 0.
    """
    for child in ReversePreOrderIter(area):
        if child.name.startswith("list"):
            # We need to normalize the xpos back to the first character
            child.xpos = int(child.obj.bbox.left) - area.xpos
        else:
            # And then make the xpos relative to the area left for consistent comparisons
            child.xpos -= area.xpos
    area.xpos = 0
    return area


def merge_area(document: Node, area: Node, debug: bool = False) -> Node:
    """Re-parent the children of ``area`` into the ``document`` tree.

    Children that come before the first ``head*`` child of ``area`` are
    attached to a host node found from ``document._end``; the first ``head*``
    child is attached to ``document`` and all later children to that child.

    :param document: the tree built so far, or ``None`` to create a new
        ``document`` root from ``area.page``.
    :param area: the area node whose children are moved; it is normalized
        in place first.
    :param debug: log intermediate state through the module logger.
    :return: the document root (newly created when ``document`` was ``None``).
    """
    if document is None:
        document = Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None)
        document._end = document
    if not area.children:
        return document

    def _find_end(node):
        # Find the last leaf node but skip lines, paragraphs, captions/tables/figures.
        # Falls back to the first node of the reverse iteration, or `node` itself.
        fallback = next(ReversePreOrderIter(node), node)
        return next(
            (c for c in ReversePreOrderIter(node) if c.name.startswith(("head", "list", "note"))),
            fallback,
        )

    def _find_ancestor(filter_):
        # The current end node wins if it matches; otherwise take the first
        # match on its reverse path, and the document root if nothing matches.
        if filter_(document._end):
            return document._end
        return next((c for c in document._end.iter_path_reverse() if filter_(c)), document.root)

    area = _normalize_area(area)
    if debug:
        _LOGGER.debug(RenderTree(area))
    children = area.children
    # All area nodes up to the next top-level element must now be
    # xpos-aligned with the previous area's last leaf node
    connect_index = next((ii for ii, c in enumerate(children) if c.name.startswith("head")), len(children))
    x_em = area.page._spacing["x_em"]

    if debug:
        # Logger methods take a format string plus arguments, unlike print()
        _LOGGER.debug("area=%s connect_index=%s", area, connect_index)
    # Align these children with the last leaf node xpos
    for child in children[:connect_index]:
        if child.name.startswith("list"):
            # Find the node that is left of the current node but not too far left
            host = _find_ancestor(lambda c: -4 * x_em < (c.xpos - child.xpos) < -x_em or c.name.startswith("head"))
        elif (
            child.name == "para"
            and document._end.name == "note"
            and child.children[0].obj.contains_font("Italic", "Oblique")
        ):
            host = document._end
        else:
            # Insert underneath the next heading
            host = _find_ancestor(lambda c: c.name.startswith("head"))

        child.parent = host
        document._end = _find_end(document)
        if debug:
            _LOGGER.debug("child=%r", child)
            _LOGGER.debug("host=%r", host)
            _LOGGER.debug("end=%s", document._end)

    # Add the remaining top-level children to connect index node
    if connect_index < len(children):
        children[connect_index].parent = document
        for child in children[connect_index + 1 :]:
            child.parent = children[connect_index]

    document._end = _find_end(document)

    return document


def normalize_lists(node: Node) -> Node:
    """Group runs of equally named ``list*`` children under one list node.

    The tree is processed from the leaves up. Every run of consecutive
    children sharing a name that starts with ``list`` is replaced by a new
    node of that name, and the run's nodes are renamed to ``element`` and
    moved below it. Other children keep their position.

    :param node: the subtree root; it is modified in place.
    :return: the same ``node``.
    """
    runs = []
    current = []
    current_name = None
    for child in node.children:
        # Normalize the lists from the leaves up
        normalize_lists(child)
        # then split the children based on their names
        if current_name is None or child.name == current_name:
            current.append(child)
        else:
            runs.append(current)
            current = [child]
        current_name = child.name
    if current:
        runs.append(current)

    # Create a new list of children
    new_children = []
    for run in runs:
        first = run[0]
        if first.name.startswith("list"):
            # Insert a new list group node and redirect all children to it
            nlist = Node(first.name, obj=first.obj, start=first.value, xpos=first.xpos)
            for lnode in run:
                lnode.name = "element"
                lnode.parent = nlist
            new_children.append(nlist)
        else:
            new_children.extend(run)

    # Set the new children which have the same order
    node.children = new_children
    return node


def normalize_paragraphs(document: Node) -> Node:
    """Replace lone ``para`` nodes by their first child in selected parents.

    Only parents named ``element``, ``caption``, ``document`` or ``cell`` are
    considered. A ``caption`` is always processed; the others only when they
    hold exactly one ``para``. Afterwards all ``text`` children of the parent
    are merged into the first one.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    paras = anytree.search.findall(document, filter_=lambda n: n.name == "para")
    parents = {p.parent for p in paras if p.parent.name in {"element", "caption", "document", "cell"}}
    for parent in parents:
        # Replace the paragraph only if it's the *only* paragraph in this node
        if parent.name in {"caption"} or sum(1 for p in parent.children if p.name == "para") == 1:
            # Replace like this to preserve children order
            parent.children = [p.children[0] if p.name == "para" else p for p in parent.children]
            # Now we need to merge the text tags into the first one
            texts = [p for p in parent.children if p.name == "text"]
            if len(texts) > 1:
                first_text = texts[0]
                for text in texts[1:]:
                    for line in text.children:
                        line.parent = first_text
                    text.parent = None
    return document


def normalize_lines(document: Node) -> Node:
    """Wrap the children of every ``para`` node in a single new ``text`` node.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    paras = anytree.search.findall(document, filter_=lambda n: n.name == "para")
    for para in paras:
        text = Node("text")
        for line in para.children:
            line.parent = text
        para.children = [text]
    return document


def normalize_captions(document: Node) -> Node:
    """Move every ``caption`` node below the sibling it describes.

    The target is the first of the five following siblings whose name equals
    ``caption._type``; it also receives the caption's ``number``. A caption
    without such a sibling is logged as an error and detached.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    captions = anytree.search.findall(document, filter_=lambda n: n.name == "caption")
    for caption in captions:
        siblings = caption.parent.children
        cindex = siblings.index(caption)
        # Find the next table for this caption within 5 nodes; the caption
        # itself is skipped so it can never be picked as its own parent.
        for sibling in siblings[cindex + 1 : cindex + 6]:
            if sibling.name == caption._type:
                caption.parent = sibling
                sibling.number = caption.number
                break
        else:
            _LOGGER.error("Discarding caption %s!\n%s", caption, RenderTree(caption))
            caption.parent = None
    return document


def normalize_headings(document: Node) -> Node:
    """Turn every ``head*`` node into a ``section`` led by its heading.

    The first child of each heading node takes over the heading's name and
    ``marker``; if that child's first child has no children it is detached
    instead. The former heading node is renamed to ``section``.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    headings = anytree.search.findall(document, filter_=lambda n: n.name.startswith("head"))
    for heading in headings:
        para = heading.children[0]
        if not para.children[0].children:
            # Remove empty headers
            para.parent = None
        else:
            # Rename paragraph to heading
            # NOTE(review): the marker is written through __dict__ rather than
            # setattr; the reason is not visible here, so it is kept as is.
            para.__dict__["marker"] = heading.marker
            para.name = heading.name
        heading.name = "section"
    return document


def normalize_registers(document: Node) -> Node:
    """Collect runs of ``bit`` nodes into ``table`` nodes of type ``bits``.

    Within each ``section`` (and the document root) consecutive ``bit``
    children on the same ``_page`` are moved below a new ``table`` node.
    Each group is then converted into a two-column ``VirtualTable``: one row
    per bit, the left cell spanning ``_left``..``_middle`` and the right cell
    ``_middle``..``_right``.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    bits_list = []
    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    for section in sections + (document,):
        new_children = []
        bits = None
        for child in section.children:
            if child.name == "bit":
                # Insert a new bits group node and redirect all children to it
                if bits is None or bits._page != child._page:
                    bits = Node("table", xpos=child.xpos, obj=None, _type="bits", _width=1, _page=child._page)
                    new_children.append(bits)
                    bits_list.append(bits)
                child.parent = bits
            else:
                # Any other node ends the current run of bits
                bits = None
                new_children.append(child)
        # Set the new children which have the same order
        section.children = new_children

    # Reformat the bits nodes into tables
    for bits in bits_list:
        cells = []
        for ypos, bit in enumerate(bits.children):
            bit.parent = None
            # The top is the first line, the bottom by the last line
            top = next(c.obj.bbox.top for c in bit.descendants if c.name == "line")
            bottom = next(c.obj.bbox.bottom for c in reversed(bit.descendants) if c.name == "line")
            # Left table cell contains Bits
            left_bbox = Rectangle(bit._left, bottom, bit._middle, top)
            cells.append(Cell(None, (ypos, 0), left_bbox, (1, 1, 1, 1), is_simple=True))
            # Right cell contains description
            right_bbox = Rectangle(bit._middle, bottom, bit._right, top)
            cells.append(Cell(None, (ypos, 1), right_bbox, (1, 1, 1, 1)))
        tbbox = Rectangle(
            min(c.bbox.left for c in cells),
            min(c.bbox.bottom for c in cells),
            max(c.bbox.right for c in cells),
            max(c.bbox.top for c in cells),
        )
        bits.obj = VirtualTable(bits._page, tbbox, cells, "bitfield")

    return document


def normalize_tables(document: Node) -> Node:
    """Merge tables that belong together and detach the merged-in nodes.

    Tables of type ``table`` are grouped by their positive ``number`` and
    appended below the first of the group. Directly consecutive ``register``
    tables are appended to the side of the first one, directly consecutive
    ``bits`` tables below it. A table is detached only when the append call
    returns a truthy value.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    content_tables = defaultdict(list)
    register_tables = []
    bits_tables = []
    current_rtables = []
    current_bitstables = []

    def _push():
        # Close the currently open runs of register and bits tables
        nonlocal current_rtables, current_bitstables
        if current_rtables:
            register_tables.append(current_rtables)
            current_rtables = []
        if current_bitstables:
            bits_tables.append(current_bitstables)
            current_bitstables = []

    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    last_number = 0
    for section in sections + (document,):
        current_rtables = []
        current_bitstables = []
        for child in section.children:
            if child.name == "table":
                if child._type == "table":
                    # NOTE(review): nesting of the next two branches was
                    # reconstructed from their comments; confirm against upstream.
                    if child.number > 0:
                        # Collect tables with the same number together
                        content_tables[child.number].append(child)
                        if document._page._template == "blue_gray":
                            last_number = child.number
                    elif last_number > 0:
                        # Tables without caption may follow
                        content_tables[last_number].append(child)
                    _push()
                elif child._type == "register":
                    # Collect register tables that follow each other directly
                    current_rtables.append(child)
                elif child._type == "bits":
                    # Collect bits tables that follow each other directly
                    current_bitstables.append(child)
                else:
                    last_number = 0
            else:
                _push()
                last_number = 0
        _push()
        last_number = 0
    _push()

    # Merge all tables of the same number by appending at the bottom
    for tables in content_tables.values():
        for table in tables[1:]:
            # NOTE(review): writes to stdout instead of _LOGGER; presumably a
            # progress marker, so it is left unchanged. Confirm.
            print(f"T{table.obj._page.number} ", end="")
            if tables[0].obj.append_bottom(table.obj):
                table.parent = None
    # Merge all register tables by appending to the right
    for tables in register_tables:
        for table in tables[1:]:
            if tables[0].obj.append_side(table.obj, expand=True):
                table.parent = None
    # Merge all bits tables by appending at the bottom
    for tables in bits_tables:
        for table in tables[1:]:
            if tables[0].obj.append_bottom(table.obj, merge_headers=False):
                table.parent = None

    return document


def normalize_chapters(document: Node) -> Node:
    """Group the top-level children of ``document`` into ``chapter`` nodes.

    A chapter starts at every top-level child that directly contains a
    ``head1`` or ``head2`` node; children before the first such node form a
    leading chapter. The title is built from the ``line`` nodes of the first
    child's first child, prefixed with ``"0 "`` for ``head1``; the filename is
    the lower-cased title with `` /()-,:`` replaced by underscores.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    # Snapshot once: the chapters are appended to `document` further down
    children = document.children
    headings = anytree.search.findall(document, filter_=lambda n: n.name in ["head1", "head2"], maxlevel=3)
    idxs = [children.index(h.parent) for h in headings] + [len(children)]
    if idxs[0] != 0:
        idxs = [0] + idxs

    cleaner = str.maketrans(" /()-,:", "_______")

    chapters = []
    for idx0, idx1 in zip(idxs, idxs[1:]):
        # Find the chapter name
        heading = children[idx0].children[0]
        lines = anytree.search.findall(heading, filter_=lambda n: n.name == "line")
        chapter_name = " ".join("".join(c.char for c in line.obj.chars).strip() for line in lines)
        if heading.name == "head1":
            chapter_name = "0 " + chapter_name
        filename = chapter_name.lower().translate(cleaner)
        # Half-open range: the node at idx1 opens the next chapter
        chapters.append((chapter_name, filename, children[idx0:idx1]))

    for title, filename, nodes in chapters:
        chapter = Node("chapter", title=title, _filename=filename, parent=document)
        for node in nodes:
            node.parent = chapter

    return document
def
merge_area( document: anytree.node.node.Node, area: anytree.node.node.Node, debug: bool = False) -> anytree.node.node.Node:
def merge_area(document: Node, area: Node, debug: bool = False) -> Node:
    """Re-parent the children of ``area`` into the ``document`` tree.

    Children that come before the first ``head*`` child of ``area`` are
    attached to a host node found from ``document._end``; the first ``head*``
    child is attached to ``document`` and all later children to that child.

    :param document: the tree built so far, or ``None`` to create a new
        ``document`` root from ``area.page``.
    :param area: the area node whose children are moved; it is normalized
        in place first.
    :param debug: log intermediate state through the module logger.
    :return: the document root (newly created when ``document`` was ``None``).
    """
    if document is None:
        document = Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None)
        document._end = document
    if not area.children:
        return document

    def _find_end(node):
        # Find the last leaf node but skip lines, paragraphs, captions/tables/figures.
        # Falls back to the first node of the reverse iteration, or `node` itself.
        fallback = next(ReversePreOrderIter(node), node)
        return next(
            (c for c in ReversePreOrderIter(node) if c.name.startswith(("head", "list", "note"))),
            fallback,
        )

    def _find_ancestor(filter_):
        # The current end node wins if it matches; otherwise take the first
        # match on its reverse path, and the document root if nothing matches.
        if filter_(document._end):
            return document._end
        return next((c for c in document._end.iter_path_reverse() if filter_(c)), document.root)

    area = _normalize_area(area)
    if debug:
        _LOGGER.debug(RenderTree(area))
    children = area.children
    # All area nodes up to the next top-level element must now be
    # xpos-aligned with the previous area's last leaf node
    connect_index = next((ii for ii, c in enumerate(children) if c.name.startswith("head")), len(children))
    x_em = area.page._spacing["x_em"]

    if debug:
        # Logger methods take a format string plus arguments, unlike print()
        _LOGGER.debug("area=%s connect_index=%s", area, connect_index)
    # Align these children with the last leaf node xpos
    for child in children[:connect_index]:
        if child.name.startswith("list"):
            # Find the node that is left of the current node but not too far left
            host = _find_ancestor(lambda c: -4 * x_em < (c.xpos - child.xpos) < -x_em or c.name.startswith("head"))
        elif (
            child.name == "para"
            and document._end.name == "note"
            and child.children[0].obj.contains_font("Italic", "Oblique")
        ):
            host = document._end
        else:
            # Insert underneath the next heading
            host = _find_ancestor(lambda c: c.name.startswith("head"))

        child.parent = host
        document._end = _find_end(document)
        if debug:
            _LOGGER.debug("child=%r", child)
            _LOGGER.debug("host=%r", host)
            _LOGGER.debug("end=%s", document._end)

    # Add the remaining top-level children to connect index node
    if connect_index < len(children):
        children[connect_index].parent = document
        for child in children[connect_index + 1 :]:
            child.parent = children[connect_index]

    document._end = _find_end(document)

    return document
def
normalize_lists(node: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_lists(node: Node) -> Node:
    """Group runs of equally named ``list*`` children under one list node.

    The tree is processed from the leaves up. Every run of consecutive
    children sharing a name that starts with ``list`` is replaced by a new
    node of that name, and the run's nodes are renamed to ``element`` and
    moved below it. Other children keep their position.

    :param node: the subtree root; it is modified in place.
    :return: the same ``node``.
    """
    runs = []
    current = []
    current_name = None
    for child in node.children:
        # Normalize the lists from the leaves up
        normalize_lists(child)
        # then split the children based on their names
        if current_name is None or child.name == current_name:
            current.append(child)
        else:
            runs.append(current)
            current = [child]
        current_name = child.name
    if current:
        runs.append(current)

    # Create a new list of children
    new_children = []
    for run in runs:
        first = run[0]
        if first.name.startswith("list"):
            # Insert a new list group node and redirect all children to it
            nlist = Node(first.name, obj=first.obj, start=first.value, xpos=first.xpos)
            for lnode in run:
                lnode.name = "element"
                lnode.parent = nlist
            new_children.append(nlist)
        else:
            new_children.extend(run)

    # Set the new children which have the same order
    node.children = new_children
    return node
def
normalize_paragraphs(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_paragraphs(document: Node) -> Node:
    """Replace lone ``para`` nodes by their first child in selected parents.

    Only parents named ``element``, ``caption``, ``document`` or ``cell`` are
    considered. A ``caption`` is always processed; the others only when they
    hold exactly one ``para``. Afterwards all ``text`` children of the parent
    are merged into the first one.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    paras = anytree.search.findall(document, filter_=lambda n: n.name == "para")
    parents = {p.parent for p in paras if p.parent.name in {"element", "caption", "document", "cell"}}
    for parent in parents:
        # Replace the paragraph only if it's the *only* paragraph in this node
        if parent.name in {"caption"} or sum(1 for p in parent.children if p.name == "para") == 1:
            # Replace like this to preserve children order
            parent.children = [p.children[0] if p.name == "para" else p for p in parent.children]
            # Now we need to merge the text tags into the first one
            texts = [p for p in parent.children if p.name == "text"]
            if len(texts) > 1:
                first_text = texts[0]
                for text in texts[1:]:
                    for line in text.children:
                        line.parent = first_text
                    text.parent = None
    return document
def
normalize_lines(document: anytree.node.node.Node) -> anytree.node.node.Node:
def
normalize_headings(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_headings(document: Node) -> Node:
    """Turn every ``head*`` node into a ``section`` led by its heading.

    The first child of each heading node takes over the heading's name and
    ``marker``; if that child's first child has no children it is detached
    instead. The former heading node is renamed to ``section``.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    headings = anytree.search.findall(document, filter_=lambda n: n.name.startswith("head"))
    for heading in headings:
        para = heading.children[0]
        if not para.children[0].children:
            # Remove empty headers
            para.parent = None
        else:
            # Rename paragraph to heading
            # NOTE(review): the marker is written through __dict__ rather than
            # setattr; the reason is not visible here, so it is kept as is.
            para.__dict__["marker"] = heading.marker
            para.name = heading.name
        heading.name = "section"
    return document
def
normalize_registers(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_registers(document: Node) -> Node:
    """Collect runs of ``bit`` nodes into ``table`` nodes of type ``bits``.

    Within each ``section`` (and the document root) consecutive ``bit``
    children on the same ``_page`` are moved below a new ``table`` node.
    Each group is then converted into a two-column ``VirtualTable``: one row
    per bit, the left cell spanning ``_left``..``_middle`` and the right cell
    ``_middle``..``_right``.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    bits_list = []
    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    for section in sections + (document,):
        new_children = []
        bits = None
        for child in section.children:
            if child.name == "bit":
                # Insert a new bits group node and redirect all children to it
                if bits is None or bits._page != child._page:
                    bits = Node("table", xpos=child.xpos, obj=None, _type="bits", _width=1, _page=child._page)
                    new_children.append(bits)
                    bits_list.append(bits)
                child.parent = bits
            else:
                # Any other node ends the current run of bits
                bits = None
                new_children.append(child)
        # Set the new children which have the same order
        section.children = new_children

    # Reformat the bits nodes into tables
    for bits in bits_list:
        cells = []
        for ypos, bit in enumerate(bits.children):
            bit.parent = None
            # The top is the first line, the bottom by the last line
            top = next(c.obj.bbox.top for c in bit.descendants if c.name == "line")
            bottom = next(c.obj.bbox.bottom for c in reversed(bit.descendants) if c.name == "line")
            # Left table cell contains Bits
            left_bbox = Rectangle(bit._left, bottom, bit._middle, top)
            cells.append(Cell(None, (ypos, 0), left_bbox, (1, 1, 1, 1), is_simple=True))
            # Right cell contains description
            right_bbox = Rectangle(bit._middle, bottom, bit._right, top)
            cells.append(Cell(None, (ypos, 1), right_bbox, (1, 1, 1, 1)))
        tbbox = Rectangle(
            min(c.bbox.left for c in cells),
            min(c.bbox.bottom for c in cells),
            max(c.bbox.right for c in cells),
            max(c.bbox.top for c in cells),
        )
        bits.obj = VirtualTable(bits._page, tbbox, cells, "bitfield")

    return document
def
normalize_tables(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_tables(document: Node) -> Node:
    """Merge tables that belong together and detach the merged-in nodes.

    Tables of type ``table`` are grouped by their positive ``number`` and
    appended below the first of the group. Directly consecutive ``register``
    tables are appended to the side of the first one, directly consecutive
    ``bits`` tables below it. A table is detached only when the append call
    returns a truthy value.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    content_tables = defaultdict(list)
    register_tables = []
    bits_tables = []
    current_rtables = []
    current_bitstables = []

    def _push():
        # Close the currently open runs of register and bits tables
        nonlocal current_rtables, current_bitstables
        if current_rtables:
            register_tables.append(current_rtables)
            current_rtables = []
        if current_bitstables:
            bits_tables.append(current_bitstables)
            current_bitstables = []

    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    last_number = 0
    for section in sections + (document,):
        current_rtables = []
        current_bitstables = []
        for child in section.children:
            if child.name == "table":
                if child._type == "table":
                    # NOTE(review): nesting of the next two branches was
                    # reconstructed from their comments; confirm against upstream.
                    if child.number > 0:
                        # Collect tables with the same number together
                        content_tables[child.number].append(child)
                        if document._page._template == "blue_gray":
                            last_number = child.number
                    elif last_number > 0:
                        # Tables without caption may follow
                        content_tables[last_number].append(child)
                    _push()
                elif child._type == "register":
                    # Collect register tables that follow each other directly
                    current_rtables.append(child)
                elif child._type == "bits":
                    # Collect bits tables that follow each other directly
                    current_bitstables.append(child)
                else:
                    last_number = 0
            else:
                _push()
                last_number = 0
        _push()
        last_number = 0
    _push()

    # Merge all tables of the same number by appending at the bottom
    for tables in content_tables.values():
        for table in tables[1:]:
            # NOTE(review): writes to stdout instead of _LOGGER; presumably a
            # progress marker, so it is left unchanged. Confirm.
            print(f"T{table.obj._page.number} ", end="")
            if tables[0].obj.append_bottom(table.obj):
                table.parent = None
    # Merge all register tables by appending to the right
    for tables in register_tables:
        for table in tables[1:]:
            if tables[0].obj.append_side(table.obj, expand=True):
                table.parent = None
    # Merge all bits tables by appending at the bottom
    for tables in bits_tables:
        for table in tables[1:]:
            if tables[0].obj.append_bottom(table.obj, merge_headers=False):
                table.parent = None

    return document
def
normalize_chapters(document: anytree.node.node.Node) -> anytree.node.node.Node:
def normalize_chapters(document: Node) -> Node:
    """Group the top-level children of ``document`` into ``chapter`` nodes.

    A chapter starts at every top-level child that directly contains a
    ``head1`` or ``head2`` node; children before the first such node form a
    leading chapter. The title is built from the ``line`` nodes of the first
    child's first child, prefixed with ``"0 "`` for ``head1``; the filename is
    the lower-cased title with `` /()-,:`` replaced by underscores.

    :param document: the tree root; it is modified in place.
    :return: the same ``document``.
    """
    # Snapshot once: the chapters are appended to `document` further down
    children = document.children
    headings = anytree.search.findall(document, filter_=lambda n: n.name in ["head1", "head2"], maxlevel=3)
    # The list always ends with len(children), so no second end marker is needed
    idxs = [children.index(h.parent) for h in headings] + [len(children)]
    if idxs[0] != 0:
        idxs = [0] + idxs

    cleaner = str.maketrans(" /()-,:", "_______")

    chapters = []
    for idx0, idx1 in zip(idxs, idxs[1:]):
        # Find the chapter name
        heading = children[idx0].children[0]
        lines = anytree.search.findall(heading, filter_=lambda n: n.name == "line")
        chapter_name = " ".join("".join(c.char for c in line.obj.chars).strip() for line in lines)
        if heading.name == "head1":
            chapter_name = "0 " + chapter_name
        filename = chapter_name.lower().translate(cleaner)
        # Half-open range: the node at idx1 opens the next chapter
        chapters.append((chapter_name, filename, children[idx0:idx1]))

    for title, filename, nodes in chapters:
        chapter = Node("chapter", title=title, _filename=filename, parent=document)
        for node in nodes:
            node.parent = chapter

    return document