modm_data.pdf2html.stmicro.ast
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

import logging
from lxml import etree
import anytree
from anytree import RenderTree
from collections import defaultdict
from ...utils import list_strip, Rectangle, ReversePreOrderIter
from .table import VirtualTable, TableCell

LOGGER = logging.getLogger(__name__)


def _normalize_area(area):
    """Make the `xpos` of every node in `area` relative to the area itself.

    List nodes get their `xpos` recomputed from the left edge of their
    object's bounding box, all other nodes are shifted by the area's `xpos`.
    Afterwards the area's own `xpos` is reset to 0.

    :param area: root node of the area tree, modified in place.
    :return: the same `area` node.
    """
    for child in ReversePreOrderIter(area):
        if child.name.startswith("list"):
            # We need to normalize the xpos back to the first character
            child.xpos = int(child.obj.bbox.left) - area.xpos
        else:
            # And then make the xpos relative to the area left for consistent comparisons
            child.xpos -= area.xpos
    area.xpos = 0
    return area


def merge_area(document, area, debug=False):
    """Merge the children of `area` into the `document` tree.

    Children in front of the first heading of the area are attached to a host
    node found by walking up from `document._end` (the current insertion
    point). The first heading is attached to the document and all children
    after it are attached to that heading.

    :param document: the document root node, or `None` to create a new one
        from the area's page.
    :param area: the area tree whose children get re-parented.
    :param debug: print the intermediate merge state to stdout.
    :return: the (possibly newly created) document root node.
    """
    if document is None:
        document = anytree.Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None)
        document._end = document
    if not area.children:
        return document
    if debug: print()

    def _find_end(node):
        # Find the last leaf node but skip lines, paragraphs, captions/tables/figures
        return next((c for c in ReversePreOrderIter(node)
                     if any(c.name.startswith(name) for name in {"head", "list", "note"})),
                    next(ReversePreOrderIter(node), node))

    def _find_ancestor(filter_):
        if filter_(document._end): return document._end
        return next((c for c in document._end.iter_path_reverse()
                     if filter_(c)), document.root)

    area = _normalize_area(area)
    if debug: print(RenderTree(area))
    children = area.children
    # All area nodes up to the next top-level element must now be
    # xpos-aligned with the previous area's last leaf node
    connect_index = next((ii for ii, c in enumerate(children)
                          if c.name.startswith("head")), len(children))
    x_em = area.page._spacing["x_em"]

    if debug: print("area=", area, "connect_index=", connect_index)
    # Align these children with the last leaf node xpos
    for child in children[:connect_index]:
        if any(child.name.startswith(name) for name in {"list"}):
            # Find the node that is left of the current node but not too far left
            host = _find_ancestor(lambda c: -4 * x_em < (c.xpos - child.xpos) < -x_em or
                                            c.name.startswith("head"))
        elif (child.name == "para" and document._end.name == "note" and
              child.children[0].obj.contains_font("Italic", "Oblique")):
            host = document._end
        else:
            # Insert underneath the next heading
            host = _find_ancestor(lambda c: c.name.startswith("head"))

        child.parent = host
        document._end = _find_end(document)
        if debug:
            print("child=", child)
            print("host=", host)
            print("end=", document._end)
            print()

    # Add the remaining top-level children to connect index node
    if connect_index < len(children):
        children[connect_index].parent = document
        for child in children[connect_index + 1:]:
            child.parent = children[connect_index]

    document._end = _find_end(document)

    if debug:
        print()
        print()

    return document


def _normalize_lists(node):
    """Group consecutive same-named list children under a common list node.

    Works recursively from the leaves up. Every run of consecutive children
    whose name starts with "list" is moved below a new node carrying the list
    name, and the moved children are renamed to "element".

    :param node: tree node whose children are regrouped in place.
    :return: the same `node`.
    """
    lists = []
    current = []
    current_name = None
    for child in node.children:
        # Normalize the lists from the leaves up
        _normalize_lists(child)
        # then split the children based on their names
        if current_name is None or child.name == current_name:
            current.append(child)
        else:
            lists.append(current)
            current = [child]
        current_name = child.name
    if current:
        lists.append(current)

    # Create a new list of children
    new_children = []
    for llist in lists:
        # Insert a new list group node and redirect all children to it
        if llist[0].name.startswith("list"):
            nlist = anytree.Node(llist[0].name, obj=llist[0].obj,
                                 start=llist[0].value, xpos=llist[0].xpos)
            for lnode in llist:
                lnode.name = "element"
                lnode.parent = nlist

            new_children.append(nlist)
        else:
            new_children.extend(llist)

    # Set the new children which have the same order
    node.children = new_children
    return node


def _normalize_paragraphs(document):
    """Replace lone paragraphs by their text node in selected parent nodes.

    Only parents named "element", "caption", "document" or "cell" are
    considered. Captions always get their paragraphs replaced, the other
    parents only if they contain exactly one paragraph. Multiple resulting
    text nodes are merged into the first one.

    :param document: root node of the tree, modified in place.
    :return: the same `document`.
    """
    paras = anytree.search.findall(document, filter_=lambda n: n.name == "para")
    parents = set(p.parent for p in paras if p.parent.name in {"element", "caption", "document", "cell"})
    for parent in parents:
        # Replace the paragraph only if it's the *only* paragraph in this node
        if parent.name in {"caption"} or sum(1 for p in parent.children if p.name == "para") == 1:
            # Replace like this to preserve children order
            parent.children = [p.children[0] if p.name == "para" else p for p in parent.children]
            # Now we need to merge the text tags into the first one
            texts = [p for p in parent.children if p.name == "text"]
            if len(texts) > 1:
                first_text = texts[0]
                for text in texts[1:]:
                    for line in text.children:
                        line.parent = first_text
                    text.parent = None
    return document


def _normalize_lines(document):
    """Wrap the children of every paragraph into a single new "text" node.

    :param document: root node of the tree, modified in place.
    :return: the same `document`.
    """
    paras = anytree.search.findall(document, filter_=lambda n: n.name == "para")
    for para in paras:
        text = anytree.Node("text")
        for line in para.children:
            line.parent = text
        para.children = [text]
    return document


def _normalize_captions(document):
    """Attach every caption to a following sibling of the matching type.

    The sibling's `number` is set to the caption's number. Captions without
    a matching sibling in the search window are logged and removed.

    :param document: root node of the tree, modified in place.
    :return: the same `document`.
    """
    captions = anytree.search.findall(document, filter_=lambda n: n.name == "caption")
    for caption in captions:
        cindex = caption.parent.children.index(caption)
        # Find the next table for this caption within 5 nodes
        # (the slice starts at the caption itself, hence 6 entries)
        for sibling in caption.parent.children[cindex:cindex + 6]:
            if sibling.name == caption._type:
                caption.parent = sibling
                sibling.number = caption.number
                break
        else:
            LOGGER.error(f"Discarding caption {caption}!\n{RenderTree(caption)}")
            caption.parent = None
    return document


def _normalize_headings(document):
    """Turn heading nodes into "section" nodes with a heading paragraph.

    The first child of each heading takes over the heading's name and marker,
    or is removed if it contains no lines. The heading node itself is renamed
    to "section".

    :param document: root node of the tree, modified in place.
    :return: the same `document`.
    """
    headings = anytree.search.findall(document, filter_=lambda n: n.name.startswith("head"))
    for heading in headings:
        para = heading.children[0]
        if not para.children[0].children:
            # Remove empty headers
            para.parent = None
        else:
            # Rename paragraph to heading
            para.__dict__["marker"] = heading.marker
            para.name = heading.name
        heading.name = "section"
    return document


def _normalize_registers(document):
    """Group consecutive "bit" nodes into virtual two-column bitfield tables.

    Consecutive "bit" children of a section (or of the document) that are on
    the same page are collected below a new "table" node of type "bits". Each
    group is then converted into a `VirtualTable` with one row per bit node.

    :param document: root node of the tree, modified in place.
    :return: the same `document`.
    """
    bits_list = []
    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    for section in (sections + (document,)):
        new_children = []
        bits = None
        for child in section.children:
            if child.name == "bit":
                # Insert a new bits group node and redirect all children to it
                if bits is None or bits._page != child._page:
                    bits = anytree.Node("table", xpos=child.xpos, obj=None,
                                        _type="bits", _width=1, _page=child._page)
                    new_children.append(bits)
                    bits_list.append(bits)
                child.parent = bits
            else:
                bits = None
                new_children.append(child)
        # Set the new children which have the same order
        section.children = new_children

    # Reformat the bits nodes into tables
    for bits in bits_list:
        cells = []
        for ypos, bit in enumerate(bits.children):
            bit.parent = None
            # The top is the first line, the bottom by the last line
            top = next(c.obj.bbox.top for c in bit.descendants if c.name == "line")
            bottom = next(c.obj.bbox.bottom for c in reversed(bit.descendants) if c.name == "line")
            # Left table cell contains Bits
            left_bbox = Rectangle(bit._left, bottom, bit._middle, top)
            cells.append(TableCell(None, (ypos, 0), left_bbox, (1,1,1,1), is_simple=True))
            # Right cell contains description
            right_bbox = Rectangle(bit._middle, bottom, bit._right, top)
            cells.append(TableCell(None, (ypos, 1), right_bbox, (1,1,1,1)))
        tbbox = Rectangle(min(c.bbox.left for c in cells),
                          min(c.bbox.bottom for c in cells),
                          max(c.bbox.right for c in cells),
                          max(c.bbox.top for c in cells))
        bits.obj = VirtualTable(bits._page, tbbox, cells, "bitfield")

    return document


def _normalize_tables(document):
    """Merge tables that belong together into their first table.

    Content tables with the same number are appended at the bottom, directly
    consecutive register tables are appended to the side and directly
    consecutive bits tables are appended at the bottom. Tables that were
    merged successfully are removed from the tree.

    :param document: root node of the tree, modified in place.
    :return: the same `document`.
    """
    content_tables = defaultdict(list)
    register_tables = []
    bits_tables = []
    current_rtables = []
    current_bitstables = []

    def _push():
        # Close the currently collected runs of register and bits tables
        nonlocal current_rtables, register_tables
        nonlocal current_bitstables, bits_tables
        if current_rtables:
            register_tables.append(current_rtables)
            current_rtables = []
        if current_bitstables:
            bits_tables.append(current_bitstables)
            current_bitstables = []

    sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
    last_number = 0
    for section in (sections + (document,)):
        current_rtables = []
        current_bitstables = []
        for child in section.children:
            if child.name == "table":
                if child._type == "table":
                    if child.number > 0:
                        # Collect tables with the same number together
                        content_tables[child.number].append(child)
                        if document._page._template == "blue_gray":
                            last_number = child.number
                    elif last_number > 0:
                        # Tables without caption may follow
                        content_tables[last_number].append(child)
                    _push()
                elif child._type == "register":
                    # Collect register tables that follow each other directly
                    current_rtables.append(child)
                elif child._type == "bits":
                    # Collect bits tables that follow each other directly
                    current_bitstables.append(child)
                else:
                    last_number = 0
            else:
                _push()
                last_number = 0
        _push()
        last_number = 0
    _push()

    # Merge all tables of the same number by appending at the bottom
    for number, tables in content_tables.items():
        for table in tables[1:]:
            print(f"T{table.obj._page.number} ", end="")
            if tables[0].obj.append_bottom(table.obj):
                table.parent = None
    # Merge all register tables by appending to the right
    for tables in register_tables:
        for table in tables[1:]:
            if tables[0].obj.append_side(table.obj, expand=True):
                table.parent = None
    # Merge all bits tables by appending at the bottom
    for tables in bits_tables:
        for table in tables[1:]:
            if tables[0].obj.append_bottom(table.obj, merge_headers=False):
                table.parent = None

    return document


def _normalize_chapters(document):
    """Split the top-level document children into "chapter" nodes.

    A new chapter starts at every top-level child that contains a "head1" or
    "head2" node. Each chapter node carries the heading text as `title` and a
    sanitized, lower-case version of it as `_filename`.

    NOTE: this pass is currently not called by `normalize_document`.

    :param document: root node of the tree, modified in place.
    :return: the same `document`.
    """
    headings = anytree.search.findall(document, filter_=lambda n: n.name in ["head1", "head2"], maxlevel=3)
    idxs = [document.children.index(h.parent) for h in headings] + [len(document.children)]
    if idxs[0] != 0:
        idxs = [0] + idxs
    if idxs[-1] != len(document.children):
        idxs += [len(document.children)]

    cleaner = str.maketrans(" /()-,:", "_______")

    chapters = []
    for idx0, idx1 in zip(idxs, idxs[1:]):
        # Find the chapter name
        heading = document.children[idx0].children[0]
        lines = anytree.search.findall(heading, filter_=lambda n: n.name == "line")
        chapter_name = ("".join(c.char for c in line.obj.chars).strip() for line in lines)
        chapter_name = " ".join(chapter_name)
        if heading.name == "head1":
            chapter_name = "0 " + chapter_name
        filename = chapter_name.lower().translate(cleaner)
        # The slice ends *before* idx1: that node is the first node of the
        # next chapter and must not be claimed by this one as well.
        chapters.append( (chapter_name, filename, document.children[idx0:idx1]) )

    for title, filename, nodes in chapters:
        chapter = anytree.Node("chapter", title=title, _filename=filename, parent=document)
        for node in nodes:
            node.parent = chapter

    return document


def normalize_document(document):
    """Run all normalization passes on the merged document tree.

    The name of every pass is printed to stdout before it runs.

    :param document: root node of the tree, modified in place.
    :return: the normalized document.
    """
    def _debug(func, indata, debug=0):
        # debug=-1 renders the tree before the pass, debug=1 after the pass
        print(func.__name__[1:])
        if debug == -1:
            print(RenderTree(indata))
            print()
        outdata = func(indata)
        if debug == 1:
            print(RenderTree(outdata))
            print()
        return outdata

    document = _debug(_normalize_lines, document)
    document = _debug(_normalize_captions, document)
    document = _debug(_normalize_lists, document)
    document = _debug(_normalize_paragraphs, document)
    document = _debug(_normalize_headings, document)
    document = _debug(_normalize_registers, document)
    document = _debug(_normalize_tables, document)
    # document = _debug(_normalize_chapters, document)
    return document


def _format_html_figure(xmlnode, figurenode):
    """Append a placeholder `<table>` for a figure to `xmlnode`.

    The table width is derived from `figurenode._width`, the caption (if any)
    is rendered, and the single cell contains the text "(omitted)".

    :param xmlnode: parent XML element.
    :param figurenode: figure tree node.
    """
    tnode = etree.Element("table")
    tnode.set("width", f"{int(figurenode._width * 50)}%")
    xmlnode.append(tnode)

    captionnode = next((c for c in figurenode.children if c.name == "caption"), None)
    if captionnode is not None:
        tnode.set("id", f"figure{captionnode.number}")
        caption = etree.Element("caption")
        tnode.append(caption)
        _format_html(caption, captionnode, with_newlines=True)

    ynode = etree.Element("tr")
    tnode.append(ynode)

    xynode = etree.Element("td")
    ynode.append(xynode)
    xynode.text = "(omitted)"


def _format_html_table(xmlnode, tablenode):
    """Append an HTML `<table>` for `tablenode` to `xmlnode`.

    Renders the optional caption, sets the table class for register and
    bitfield tables, and emits one `<tr>` per row with `<th>`/`<td>` cells
    including spans, alignment, rotation and header border classes.

    :param xmlnode: parent XML element.
    :param tablenode: table tree node, whose `obj` provides the cells.
    """
    tnode = etree.Element("table")
    xmlnode.append(tnode)
    # Format the caption
    captionnode = next((c for c in tablenode.children if c.name == "caption"), None)
    if captionnode is not None:
        tnode.set("id", f"table{captionnode.number}")
        caption = etree.Element("caption")
        tnode.append(caption)
        _format_html(caption, captionnode, with_newlines=True)
    if tablenode.obj._type == "register":
        tnode.set("class", "rt")
    if tablenode.obj._type == "bitfield":
        tnode.set("class", "bt")

    # Cells are ordered (y, x) positions
    ypos = -1
    ynode = None
    header_rows = tablenode.obj.header_rows
    for cell in tablenode.obj.cells:
        # Add another row to the table
        if ypos != cell.y or ynode is None:
            ypos = cell.y
            ynode = etree.Element("tr")
            tnode.append(ynode)

        # Add the right cell with spans and style
        xynodespan = xynode = etree.Element("th" if cell.is_header else "td")
        ynode.append(xynode)
        if cell.xspan > 1:
            xynode.set("colspan", str(cell.xspan))
        if cell.yspan > 1:
            xynode.set("rowspan", str(cell.yspan))
        if not cell.rotation and tablenode.obj._type != "register" and cell.left_aligned:
            xynode.set("class", "tl")
        if cell.rotation:
            # Rotated content is wrapped into an extra span inside the cell
            xynodespan = etree.Element("span")
            xynodespan.set("class", "tv")
            xynode.append(xynodespan)
        if (cell.y + cell.yspan) == header_rows:
            # The cell ends on the last header row
            if cl := xynode.get("class"):
                xynode.set("class", "thb " + cl)
            else:
                xynode.set("class", "thb")

        if cell._is_simple:
            xynodespan.text = cell.content.strip()
        else:
            cell_doc = anytree.Node("document", _page=cell.ast.page)
            cell.ast.parent = cell_doc
            cell_doc = _normalize_lines(cell_doc)
            cell_doc = _normalize_lists(cell_doc)
            cell_doc = _normalize_paragraphs(cell_doc)
            # print(RenderTree(cell_doc))
            _format_html(xynodespan, cell_doc, with_newlines=True,
                         ignore_formatting={"bold"} if cell.is_header else None)


def _format_char(node, state, chars, ignore):
    """Consume the first character of `chars` into the format tree.

    Compares the formatting of `chars[0]` with the current `state`. Without
    differences the character is added below `node` (as "newline" or "chars"
    node). Otherwise a formatting level is closed or opened and the character
    is *not* consumed.

    :param node: current format tree node.
    :param state: dict of active formatting flags, or `None` for no formatting.
    :param chars: sequence of character property dicts, only `chars[0]` is read.
    :param ignore: collection of formatting keys that must not be applied.
    :return: tuple `(consumed, node, state)`, where `consumed` tells the
        caller to drop the first character.
    """
    NOFMT = {
        "superscript": False,
        "subscript": False,
        "italic": False,
        "bold": False,
        "underline": False,
    }
    if state is None: state = NOFMT
    char = chars[0]
    if char["char"] in {'\r'}:
        return (True, node, state)

    # print(node, state, char["char"])
    diffs = {}
    for key in NOFMT:
        if state[key] != char[key] and key not in ignore:
            diffs[key] = char[key]
    # if diffs: print(diffs)
    if not diffs:
        prev_name = node.children[-1].name if node.children else None
        # print(node)
        if prev_name != "newline" and char["char"] == '\n':
            # if not (prev_name == "chars" and node.children[-1].chars[-1] == " "):
            anytree.Node("newline", parent=node)
        elif prev_name != "chars":
            anytree.Node("chars", parent=node, chars=char["char"])
        else:
            node.children[-1].chars += char["char"]
        return (True, node, state)
    else:
        disable = [key for key, value in diffs.items() if not value]
        if disable:
            # Close the current formatting level and retry with the parent
            state[node.name] = False
            return (False, node.parent, state)
        else:
            # Open one new formatting level and retry with it
            enable = [key for key, value in diffs.items() if value][0]
            fmtnode = anytree.Node(enable, parent=node)
            state[enable] = True
            return (False, fmtnode, state)


def _format_lines(textnode, ignore, with_newlines, with_start):
    """Convert the lines of a text node into a tree of formatting nodes.

    :param textnode: text tree node with "line" children.
    :param ignore: collection of formatting keys that must not be applied.
    :param with_newlines: keep line breaks and terminate every line with one.
    :param with_start: include the characters in front of `line.start`.
    :return: the root "format" node of the new format tree.
    """
    char_props = textnode.root._page._char_properties
    formatn = anytree.Node("format")
    chars = []
    for line in textnode.children:
        if line.name == "line":
            for char in line.obj.chars[0 if with_start else line.start:]:
                if not with_newlines and char.unicode in {0xa, 0xd}:
                    continue
                chars.append(char_props(line.obj, char))
            # `chars` is still empty if the first lines contribute no
            # characters: there is nothing to terminate then, and leading
            # newlines are stripped below anyway.
            if with_newlines and chars and chars[-1]["char"] not in {'\n'}:
                char = char_props(line.obj, line.obj.chars[-1])
                char["char"] = '\n'
                chars.append(char)

    chars = list_strip(chars, lambda c: c["char"] in {' ', '\n'})
    state = None
    node = formatn
    while chars:
        popchar, node, state = _format_char(node, state, chars, ignore)
        if popchar: chars.pop(0)
    return formatn


def _format_html_fmt(xmlnode, treenode, tail=False):
    """Recursively convert a format tree node into XML below `xmlnode`.

    "chars" nodes are appended to the text or the tail of `xmlnode`, all other
    nodes create a sub-element with the tag given by `CONV`.

    :param xmlnode: XML element to add to.
    :param treenode: format tree node.
    :param tail: append characters to the tail instead of the text of `xmlnode`.
    :return: tuple `(tail, xmlnode)` to pass on to the next sibling.
    """
    CONV = {
        "superscript": "sup",
        "subscript": "sub",
        "italic": "i",
        "bold": "b",
        "underline": "u",
        "newline": "br",
    }
    # print(xmlnode, treenode)
    if treenode.name == "chars":
        # print(f"{'tail' if tail else 'text'} char={treenode.chars}")
        if tail:
            xmlnode.tail = (xmlnode.tail or "") + treenode.chars
        else:
            xmlnode.text = (xmlnode.text or "") + treenode.chars
        return (tail, xmlnode)
    else:
        # print(f"sub {treenode.name}")
        if tail: xmlnode = xmlnode.getparent()
        subnode = etree.SubElement(xmlnode, CONV[treenode.name])
        tail = False
        iternode = subnode
        for child in treenode.children:
            tail, iternode = _format_html_fmt(iternode, child, tail)
        # Whatever follows this element belongs into its tail
        return (True, subnode)


def _format_html_text(xmlnode, treenode, ignore=None, with_newlines=False, with_start=True):
    """Render a text tree node as formatted inline content of `xmlnode`.

    :param xmlnode: XML element receiving the content.
    :param treenode: text tree node.
    :param ignore: collection of formatting keys that must not be applied.
    :param with_newlines: forwarded to `_format_lines`.
    :param with_start: forwarded to `_format_lines`.
    """
    fmttree = _format_lines(treenode, ignore or set(), with_newlines, with_start)
    tail = False
    fmtnode = xmlnode
    for child in fmttree.children:
        tail, fmtnode = _format_html_fmt(fmtnode, child, tail)

    # print(RenderTree(fmttree))
    # print(etree.tostring(xmlnode, pretty_print=True).decode("utf-8"))


def _format_html(xmlnode, treenode, ignore_formatting=None,
                 with_newlines=False, with_start=True):
    """Recursively convert a document tree node into XML below `xmlnode`.

    :param xmlnode: parent XML element.
    :param treenode: document tree node, dispatched on its `name`.
    :param ignore_formatting: set of formatting keys that must not be applied.
    :param with_newlines: forwarded to the text formatting.
    :param with_start: forwarded to the text formatting, disabled below list
        elements.
    """
    if ignore_formatting is None:
        ignore_formatting = set()
    # print(xmlnode, treenode.name)
    current = xmlnode
    if treenode.name.startswith("head"):
        current = etree.Element(f"h{treenode.name[4]}")
        if treenode.marker:
            current.set("id", f"section{treenode.marker}")
        xmlnode.append(current)
        ignore_formatting = ignore_formatting | {"bold", "italic", "underline"}

    elif treenode.name in {"para"}:
        current = etree.Element("p")
        xmlnode.append(current)

    elif treenode.name in {"note"}:
        current = etree.Element("div")
        current.set("class", "nt")
        xmlnode.append(current)

    elif treenode.name == "text":
        _format_html_text(xmlnode, treenode, ignore_formatting, with_newlines, with_start)

    elif treenode.name == "page":
        if not current.get("id"):
            current.set("id", f"page{treenode.number}")
        print(f"{treenode.number}.", end="", flush=True)
        return

    elif treenode.name == "table":
        _format_html_table(xmlnode, treenode)
        return

    elif treenode.name == "figure":
        _format_html_figure(xmlnode, treenode)
        return

    elif treenode.name == "bits":
        # NOTE(review): `_format_html_bits` is neither defined nor imported in
        # this module, so this branch raises a NameError if a "bits" node is
        # ever formatted. `_normalize_registers` creates "table" nodes instead,
        # so this is presumably dead code -- TODO confirm and remove.
        _format_html_bits(xmlnode, treenode)
        return

    elif treenode.name.startswith("list"):
        if treenode.name[4] in {"b", "s"}:
            current = etree.Element("ul")
        else:
            current = etree.Element("ol")
        xmlnode.append(current)

    elif treenode.name == "element":
        current = etree.Element("li")
        if xmlnode.tag == "ol":
            current.set("value", str(treenode.value))
        xmlnode.append(current)
        with_start = False

    for child in treenode.children:
        _format_html(current, child, ignore_formatting, with_newlines, with_start)


def format_document(document):
    """Convert the normalized document tree into an HTML element tree.

    The HTML head links the stylesheet `../style.css`, the body contains the
    formatted document.

    :param document: root node of the normalized document tree.
    :return: an `etree.ElementTree` wrapping the `<html>` element.
    """
    html = etree.Element("html")

    head = etree.Element("head")
    html.append(head)

    link = etree.Element("link")
    link.set("rel", "stylesheet")
    link.set("href", "../style.css")
    head.append(link)

    body = etree.Element("body")
    html.append(body)

    _format_html(body, document, with_newlines=True)

    html = etree.ElementTree(html)
    return html


def write_html(html, path, pretty=True):
    """Write the HTML element tree to `path` with an HTML5 doctype.

    :param html: element tree as returned by `format_document`.
    :param path: path of the output file, opened in binary mode.
    :param pretty: pretty-print the output.
    """
    with open(path, "wb") as f:
        html.write(f, pretty_print=pretty, doctype="<!DOCTYPE html>")
LOGGER =
<Logger modm_data.pdf2html.stmicro.ast (WARNING)>
def
merge_area(document, area, debug=False):
def merge_area(document, area, debug=False):
    """Merge the children of `area` into the `document` tree.

    Children in front of the area's first heading are hung below a host node
    that is looked up by walking from the current insertion point
    (`document._end`) towards the root. The first heading becomes a child of
    the document and every child after it becomes a child of that heading.

    :param document: document root node, `None` creates a fresh one from the
        area's page.
    :param area: area tree whose children get re-parented.
    :param debug: print the intermediate merge state to stdout.
    :return: the (possibly newly created) document root node.
    """
    if document is None:
        document = anytree.Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None)
        document._end = document
    if not area.children:
        return document
    if debug:
        print()

    def _find_end(node):
        # Last node that can host content (heading, list or note); falls back
        # to the very last node, which skips lines, paragraphs,
        # captions/tables/figures only if such a host exists.
        fallback = next(ReversePreOrderIter(node), node)
        for candidate in ReversePreOrderIter(node):
            if candidate.name.startswith(("head", "list", "note")):
                return candidate
        return fallback

    def _find_ancestor(filter_):
        # First node accepted by the filter: the insertion point itself, then
        # its path towards the root, and finally the root as a last resort.
        end = document._end
        if filter_(end):
            return end
        for ancestor in end.iter_path_reverse():
            if filter_(ancestor):
                return ancestor
        return document.root

    def _is_heading(node):
        return node.name.startswith("head")

    area = _normalize_area(area)
    if debug:
        print(RenderTree(area))
    children = area.children
    # Everything in front of the first heading continues the previous area
    # and must be xpos-aligned with the previous area's last leaf node.
    connect_index = len(children)
    for position, candidate in enumerate(children):
        if _is_heading(candidate):
            connect_index = position
            break
    x_em = area.page._spacing["x_em"]

    if debug:
        print("area=", area, "connect_index=", connect_index)
    # Align these children with the last leaf node xpos
    for child in children[:connect_index]:
        if child.name.startswith("list"):
            def _is_list_host(node):
                # Left of the current node, but not too far left
                offset = node.xpos - child.xpos
                return -4 * x_em < offset < -x_em or _is_heading(node)
            host = _find_ancestor(_is_list_host)
        elif (child.name == "para" and document._end.name == "note" and
              child.children[0].obj.contains_font("Italic", "Oblique")):
            host = document._end
        else:
            # Insert underneath the next heading
            host = _find_ancestor(_is_heading)

        child.parent = host
        document._end = _find_end(document)
        if debug:
            print("child=", child)
            print("host=", host)
            print("end=", document._end)
            print()

    # The first heading goes below the document, the rest below that heading
    if connect_index < len(children):
        connector = children[connect_index]
        connector.parent = document
        for child in children[connect_index + 1:]:
            child.parent = connector

    document._end = _find_end(document)

    if debug:
        print()
        print()

    return document
def
normalize_document(document):
def normalize_document(document):
    """Run the normalization passes on the merged document tree, in order.

    The name of each pass (without its leading underscore) is printed to
    stdout before the pass runs.

    :param document: root node of the document tree.
    :return: the document returned by the last pass.
    """
    def _run_pass(func, tree, debug=0):
        # debug=-1 renders the tree before the pass, debug=1 after the pass
        print(func.__name__[1:])
        if debug == -1:
            print(RenderTree(tree))
            print()
        result = func(tree)
        if debug == 1:
            print(RenderTree(result))
            print()
        return result

    # The order matters: later passes rely on the node names created by the
    # earlier ones. `_normalize_chapters` is currently disabled.
    passes = (
        _normalize_lines,
        _normalize_captions,
        _normalize_lists,
        _normalize_paragraphs,
        _normalize_headings,
        _normalize_registers,
        _normalize_tables,
    )
    for normalize in passes:
        document = _run_pass(normalize, document)
    return document
def
format_document(document):
def format_document(document):
    """Build the HTML element tree for a normalized document.

    The `<head>` links the stylesheet `../style.css` and the `<body>` holds
    the formatted document content.

    :param document: root node of the normalized document tree.
    :return: an `etree.ElementTree` wrapping the `<html>` root element.
    """
    root = etree.Element("html")

    # <head> with the stylesheet link
    head = etree.SubElement(root, "head")
    stylesheet = etree.SubElement(head, "link")
    stylesheet.set("rel", "stylesheet")
    stylesheet.set("href", "../style.css")

    # <body> with the rendered document
    body = etree.SubElement(root, "body")
    _format_html(body, document, with_newlines=True)

    return etree.ElementTree(root)
def
write_html(html, path, pretty=True):