modm_data.html
1# Copyright 2022, Niklas Hauser 2# SPDX-License-Identifier: MPL-2.0 3 4from . import stmicro 5from .document import Document 6from .chapter import Chapter 7from .table import Table 8from .text import Text, Heading, replace, listify 9from .list import List 10 11__all__ = [ 12 "stmicro", 13 "Document", 14 "Chapter", 15 "Table", 16 "Text", 17 "Heading", 18 "List", 19 "replace", 20 "listify", 21]
class
Document:
class Document:
    """A document stored as a directory of per-chapter ``*.html`` files.

    The last path component is split on ``-``: the first part becomes
    ``name`` and the second part becomes ``version``.
    """

    def __init__(self, path: str):
        """Record the document location and derive its name and version.

        :param path: Location of the document directory. Its stem must
            contain at least one ``-``, otherwise ``IndexError`` is raised.
        """
        self.path = Path(path)
        self.relpath = os.path.relpath(self.path, Path.cwd())
        self.fullname = self.path.stem
        parts = self.fullname.split("-")
        self.name = parts[0]
        self.version = parts[1]

    @cached_property
    def _chapters(self) -> dict[str, Chapter]:
        """Map chapter names to chapters, one per ``*.html`` file in ``path``.

        The chapter name is the file stem with underscores replaced by spaces.
        """
        return {path.stem.replace("_", " "): Chapter(path) for path in self.path.glob("*.html")}

    @cached_property
    def path_pdf(self) -> Path:
        """Path of the PDF, derived from ``path`` by swapping ``/html/`` for
        ``/pdf/`` and appending ``.pdf``."""
        return Path(str(self.path).replace("/html/", "/pdf/") + ".pdf")

    def chapters(self, pattern: str = None) -> list[Chapter]:
        """Return all chapters, or only those whose name matches ``pattern``.

        :param pattern: Regular expression searched case-insensitively in the
            chapter name. ``None`` selects every chapter.
        """
        if pattern is None:
            return list(self._chapters.values())
        return [c for name, c in self._chapters.items() if re.search(pattern, name, re.IGNORECASE)]

    def chapter(self, pattern: str) -> Chapter:
        """Return the single chapter whose name matches ``pattern``.

        :param pattern: Regular expression, see :meth:`chapters`.
        :raises AssertionError: If zero or more than one chapter matches. The
            problem is logged before raising.
        """
        chapters = self.chapters(pattern)
        if len(chapters) == 1:
            return chapters[0]

        if not chapters:
            LOGGER.error(f"Cannot find chapter with pattern '{pattern}'!")
        else:
            LOGGER.error(f"Found multiple chapters with pattern '{pattern}'!")
            for chapter in chapters:
                LOGGER.error(f" - {chapter.name}")
        # Raised explicitly instead of using `assert`, so that the check is
        # not stripped when Python runs with -O.
        raise AssertionError(f"Expected exactly one chapter matching '{pattern}', found {len(chapters)}!")

    def __repr__(self) -> str:
        return f"Doc({self.fullname})"
def chapter(self, pattern: str) -> Chapter:
    """Return the single chapter whose name matches ``pattern``.

    :param pattern: Regular expression forwarded to ``self.chapters()``.
    :raises AssertionError: If zero or more than one chapter matches. The
        problem is logged before raising.
    """
    chapters = self.chapters(pattern)
    if len(chapters) == 1:
        return chapters[0]

    if not chapters:
        LOGGER.error(f"Cannot find chapter with pattern '{pattern}'!")
    else:
        LOGGER.error(f"Found multiple chapters with pattern '{pattern}'!")
        for chapter in chapters:
            LOGGER.error(f" - {chapter.name}")
    # Raised explicitly instead of using `assert`, so that the check is not
    # stripped when Python runs with -O.
    raise AssertionError(f"Expected exactly one chapter matching '{pattern}', found {len(chapters)}!")
class
Chapter:
class Chapter:
    """One chapter of a document, backed by a single HTML file.

    The file is parsed lazily on first access to the chapter's items.
    Chapters hash and compare by the stem of their file name.
    """

    def __init__(self, path: str):
        """
        :param path: Location of the chapter's HTML file.
        """
        self._path = Path(path)

    @cached_property
    def _parser(self):
        """Parser that has been fed the whole file content (created once)."""
        parser = Parser()
        parser.feed(self._path.read_text())
        return parser

    @property
    def _relpath(self) -> Path:
        """File path relative to the current working directory."""
        return self._path.relative_to(Path.cwd())

    @property
    def name(self) -> str:
        """File stem with underscores replaced by spaces."""
        return self._path.stem.replace("_", " ")

    @property
    def number(self) -> int:
        """Integer value of the second ``_``-separated field of the file stem."""
        return int(self._path.stem.split("_")[1])

    @property
    def items(self) -> list:
        """All items collected by the parser, in parser order."""
        return self._parser._items

    def headings(self) -> list[Heading]:
        """Return every item that is a ``Heading``."""
        return [h for h in self.items if isinstance(h, Heading)]

    def texts(self) -> list[Text]:
        """Return every item that is a ``Text``."""
        return [t for t in self.items if isinstance(t, Text)]

    def _heading_objects(self, obj_type, pattern, **subs) -> list:
        """Group items of ``obj_type`` under the heading that precedes them.

        :param obj_type: Item class to collect.
        :param pattern: Regular expression searched case-insensitively in the
            heading text, or ``None`` to keep every group.
        :param subs: Substitutions forwarded to ``Heading.text()``.
        :return: List of ``(heading, [objects])`` tuples. ``heading`` is
            ``None`` for objects found before the first heading. Headings
            without any object of ``obj_type`` are left out.
        """
        heading_texts = []
        current = [None, []]
        for item in self.items:
            if isinstance(item, Heading):
                # A new heading closes the running group, if it has content.
                if current[1]:
                    heading_texts.append(tuple(current))
                current = [item, []]
            elif isinstance(item, obj_type):
                current[1].append(item)
        if current[1]:
            heading_texts.append(tuple(current))
        if pattern is None:
            return heading_texts
        # Groups without a heading are matched against the empty string.
        return [
            ht
            for ht in heading_texts
            if re.search(pattern, ht[0].text(**subs) if ht[0] is not None else "", re.IGNORECASE)
        ]

    def heading_texts(self, pattern=None, **subs) -> list:
        """Return ``(heading, [Text, ...])`` groups, see ``_heading_objects``."""
        return self._heading_objects(Text, pattern, **subs)

    def heading_tables(self, pattern=None, **subs) -> list:
        """Return ``(heading, [Table, ...])`` groups, see ``_heading_objects``."""
        return self._heading_objects(Table, pattern, **subs)

    def tables(self, pattern: str = None, **subs) -> list[Table]:
        """Return all tables, or only those whose caption matches ``pattern``.

        :param pattern: Regular expression searched case-insensitively in the
            table caption. ``None`` selects every table.
        :param subs: Substitutions forwarded to ``Table.caption()``.
        """
        tables = [t for t in self.items if isinstance(t, Table)]
        if pattern is None:
            return tables
        return [t for t in tables if re.search(pattern, t.caption(**subs), re.IGNORECASE)]

    def table(self, pattern: str) -> Table:
        """Return the single table whose caption matches ``pattern``.

        :raises AssertionError: If zero or more than one table matches. The
            problem is logged before raising.
        """
        tables = self.tables(pattern)
        if len(tables) == 1:
            return tables[0]

        if not tables:
            LOGGER.error(f"Cannot find table with pattern '{pattern}'!")
        else:
            LOGGER.error(f"Found multiple tables with pattern '{pattern}'!")
        # Raised explicitly instead of using `assert`, so that the check is
        # not stripped when Python runs with -O.
        raise AssertionError(f"Expected exactly one table matching '{pattern}', found {len(tables)}!")

    def __hash__(self) -> int:
        return hash(self._path.stem)

    def __eq__(self, other: object) -> bool:
        # Equality uses the same key as __hash__, so equal chapters always
        # share a hash.
        if not isinstance(other, Chapter):
            return NotImplemented
        return self._path.stem == other._path.stem

    def __repr__(self) -> str:
        return f"Chapter({self.name})"
def table(self, pattern: str) -> Table:
    """Return the single table whose caption matches ``pattern``.

    :param pattern: Regular expression forwarded to ``self.tables()``.
    :raises AssertionError: If zero or more than one table matches. The
        problem is logged before raising.
    """
    tables = self.tables(pattern)
    if len(tables) == 1:
        return tables[0]

    if not tables:
        LOGGER.error(f"Cannot find table with pattern '{pattern}'!")
    else:
        LOGGER.error(f"Found multiple tables with pattern '{pattern}'!")
    # Raised explicitly instead of using `assert`, so that the check is not
    # stripped when Python runs with -O.
    raise AssertionError(f"Expected exactly one table matching '{pattern}', found {len(tables)}!")
class
Table:
class Table:
    # A grid of cells with a number of header rows, a heading and a caption.
    #
    # Cells are collected in `_cells`; `_normalize()` then computes the table
    # size and fills `_grid`, a list of rows where a spanning cell occupies
    # every grid slot it covers.

    def __init__(self, heading=None):
        # A falsy `heading` is replaced by an empty Heading.
        self._heading = heading or Heading("")
        self._cells = []
        # (columns, rows); set by _normalize().
        self._size = (0, 0)
        # Row-major grid of cells; None until _normalize() has run.
        self._grid = None
        # Number of header rows at the top of the grid.
        self._hrows = 0
        self._caption = Text("")

    def __repr__(self) -> str:
        return f"Table({self.columns}×{self.rows})"

    def heading(self, **filters):
        # Text of the heading, with `filters` forwarded to Heading.text().
        return self._heading.text(**filters)

    def caption(self, **filters):
        # Text of the caption, with `filters` forwarded to Text.text().
        return self._caption.text(**filters)

    def _domains_x(self, **subs) -> dict[str, list[int]]:
        # Map each header "domain" name to the column indices that carry it.
        #
        # For every column the header rows are walked top-down and a cell is
        # recorded whenever it differs (`!=`) from the one above it. The
        # domain name is the ":"-joined text of those cells, with any ":"
        # inside a cell text removed first. Columns without header cells end
        # up under the empty-string domain.
        domains = defaultdict(list)
        for x in range(self.columns):
            cell = None
            domain = []
            for y in range(self._hrows):
                if (ncell := self.cell(x, y)) != cell:
                    cell = ncell
                    domain.append(cell)
            domain = ":".join(cell.text(**subs).replace(":", "") for cell in domain)
            domains[domain].append(x)
        return dict(domains)

    def _domains_y(self, columns: list[int], **subs) -> dict[str, list[int]]:
        # Map each row "domain" name to the body row indices that carry it.
        #
        # For every row below the header, the given `columns` are walked in
        # order and a cell is recorded whenever it differs (`!=`) from the
        # previously visited one. The name is built exactly as in
        # _domains_x(). Rows that yield no cells are skipped.
        domains = defaultdict(list)
        for y in range(self._hrows, self.rows):
            cell = None
            cells = []
            for x in columns:
                if (ncell := self.cell(x, y)) != cell:
                    cell = ncell
                    cells.append(cell)
            if cells:
                domain = ":".join(cell.text(**subs).replace(":", "") for cell in cells)
                domains[domain].append(y)
        return dict(domains)

    def domains_x(self, pattern=None, **subs) -> list[str]:
        # Sorted header domain names, optionally filtered by a case-insensitive
        # regex search for `pattern`.
        domains = sorted(self._domains_x(**subs).keys())
        if pattern is not None:
            domains = [d for d in domains if re.search(pattern, d, re.IGNORECASE)]
        return domains

    def domains(self, pattern: str, **subs) -> Domains:
        # Collect every header domain matching `pattern` (case-insensitive
        # regex search) together with all of their column indices, and wrap
        # them in a Domains object.
        domains = []
        columns = []
        for domain, cols in self._domains_x(**subs).items():
            if re.search(pattern, domain, re.IGNORECASE):
                domains.append(domain)
                columns.extend(cols)
        return Domains(self, domains, columns, pattern)

    def cell_rows(self, pattern: str = None, **subs) -> dict[str, list[Cell]]:
        # NOTE: despite the return annotation this is a generator. It yields
        # one ReDict per body row, mapping each selected header domain to the
        # list of cells of that row in the domain's columns.
        # `pattern=None` selects every domain; otherwise a case-insensitive
        # regex search on the domain name decides.
        columns = []
        for domain, cols in self._domains_x(**subs).items():
            if pattern is None or re.search(pattern, domain, re.IGNORECASE):
                columns.append((domain, cols))
        for y in range(self._hrows, self.rows):
            values = defaultdict(list)
            for domain, cols in columns:
                values[domain].extend(self.cell(c, y) for c in cols)
            yield ReDict(values)

    def cell(self, x: int, y: int) -> Cell:
        # Grid entry at column `x`, row `y`. Only the upper bounds are
        # asserted; the entry is None where _normalize() placed no cell.
        assert x < self.columns
        assert y < self.rows
        return self._grid[y][x]

    @property
    def columns(self) -> int:
        # Number of columns (first element of `_size`).
        return self._size[0]

    @property
    def rows(self) -> int:
        # Number of rows (second element of `_size`).
        return self._size[1]

    @property
    def size(self) -> tuple[int, int]:
        # (columns, rows)
        return self._size

    def _normalize(self):
        # Compute the table size from `_cells` and lay the cells out in `_grid`.
        #
        # Before this runs, only the row of each cell's first `_pos` entry is
        # read; every cell's `_pos` is then rebuilt as the list of (x, y)
        # grid slots it occupies.

        # Width: sum of the column spans of all cells that start in row 0.
        xsize = sum(c._span[0] for c in self._cells if c._pos[0][1] == 0)
        # Height: furthest row reached by any cell (start row + row span).
        ysize = max(c._pos[0][1] + c._span[1] for c in self._cells)
        self._size = (xsize, ysize)
        self._grid = [[None for _ in range(xsize)] for _ in range(ysize)]

        xpos, ypos = 0, 0
        for cell in self._cells:
            # print(cell._pos, cell._span, cell._data)
            ypos = cell._pos[0][1]
            # The header extends down to the row of the latest header cell seen.
            if cell._head:
                self._hrows = ypos + 1
            cell._pos = []
            # Previous cells with multiple rows may already have taken our current xpos
            # We must find the next cell that's still empty
            # (if the row has no empty slot left, the running xpos is kept)
            xpos = next((x for x, c in enumerate(self._grid[ypos]) if c is None), xpos)
            # Fill every slot the cell spans, clipped to the table size.
            for y in range(ypos, min(ypos + cell._span[1], ysize)):
                for x in range(xpos, min(xpos + cell._span[0], xsize)):
                    self._grid[y][x] = cell
                    cell._pos.append((x, y))
            xpos += cell._span[0]

    def render(self):
        # Print the grid to stdout: one line per row, tab-separated, showing the
        # first 15 characters of each cell text, or "None" for an empty slot.
        for y in range(self.rows):
            for x in range(self.columns):
                print(self.cell(x, y).text()[:15] if self.cell(x, y) is not None else "None", end="\t")
            print()
def
domains(self, pattern: str, **subs) -> modm_data.html.table.Domains:
def domains(self, pattern: str, **subs) -> Domains:
    """Select the header domains matching ``pattern`` and their columns.

    :param pattern: Regular expression searched case-insensitively in each
        header domain name.
    :param subs: Substitutions forwarded to ``self._domains_x()``.
    :return: A ``Domains`` object built from this table, the matching domain
        names, all of their column indices and the pattern itself.
    """
    matching = [
        (name, indices)
        for name, indices in self._domains_x(**subs).items()
        if re.search(pattern, name, re.IGNORECASE)
    ]
    names = [name for name, _ in matching]
    columns = [index for _, indices in matching for index in indices]
    return Domains(self, names, columns, pattern)
def
cell_rows( self, pattern: str = None, **subs) -> dict[str, list[modm_data.html.table.Cell]]:
def cell_rows(self, pattern: str = None, **subs) -> dict[str, list[Cell]]:
    """Iterate over the body rows, grouped by header domain.

    This is a generator: for every row below the header it yields one
    ``ReDict`` mapping each selected domain name to the list of cells of
    that row in the domain's columns.

    :param pattern: Regular expression searched case-insensitively in each
        header domain name. ``None`` selects every domain.
    :param subs: Substitutions forwarded to ``self._domains_x()``.
    """
    selected = [
        (name, indices)
        for name, indices in self._domains_x(**subs).items()
        if pattern is None or re.search(pattern, name, re.IGNORECASE)
    ]
    for row in range(self._hrows, self.rows):
        by_domain = defaultdict(list)
        for name, indices in selected:
            by_domain[name] += [self.cell(column, row) for column in indices]
        yield ReDict(by_domain)
class
Text:
def
replace(html, **substitutions) -> str:
def replace(html, **substitutions) -> str:
    """Strip or rewrite simple inline HTML tags and apply regex substitutions.

    Each keyword is one substitution rule. The rules for the tags ``u``,
    ``i``, ``b``, ``sub``, ``sup``, ``br`` and ``p`` default to ``"*"`` and
    are applied first, in that order; any other keyword follows in the order
    given.

    - tag with ``"*"``: the opening and closing tag markers are removed and
      the enclosed content is kept.
    - tag with any other replacement: every ``<tag>...</tag>`` span is
      replaced; the enclosed content is available as group 1.
    - any other keyword: the keyword itself is the regular expression passed
      to ``re.sub``.

    :param html: The string to process.
    :param substitutions: Substitution rules as described above.
    :return: The processed string.
    :raises TypeError: If ``html`` is not a ``str``.
    """
    # Fail early with the offending value in the message; `re.sub` would raise
    # the same exception type, but without saying what it was given.
    if not isinstance(html, str):
        raise TypeError(f"replace() expects a str of HTML, got {type(html).__name__}: {html!r}")

    tags = ("u", "i", "b", "sub", "sup", "br", "p")
    subs = dict.fromkeys(tags, "*")
    subs.update(substitutions)
    for pattern, replacement in subs.items():
        if pattern not in tags:
            html = re.sub(pattern, replacement, html)
        elif replacement == "*":
            html = re.sub(f"</?{pattern}>", "", html)
        else:
            html = re.sub(f"<{pattern}>(.*?)</{pattern}>", replacement, html)
    return html
def
listify(text, pattern=None, strip=True) -> list[str]: