modm_data.html

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4from . import stmicro
 5from .document import Document
 6from .chapter import Chapter
 7from .table import Table
 8from .text import Text, Heading, replace, listify
 9from .list import List
10
11__all__ = [
12    "stmicro",
13    "Document",
14    "Chapter",
15    "Table",
16    "Text",
17    "Heading",
18    "List",
19    "replace",
20    "listify",
21]
class Document:
    """A converted document stored as a directory of per-chapter HTML files.

    The stem of the directory name is split on ``-``: the first part becomes
    :attr:`name`, the second part becomes :attr:`version`.
    """

    def __init__(self, path: str):
        """
        :param path: directory that contains the ``*.html`` chapter files.
        :raises IndexError: if the path stem contains no ``-`` separator.
        """
        self.path = Path(path)
        # Path relative to the current working directory.
        self.relpath = os.path.relpath(self.path, Path.cwd())
        self.fullname = self.path.stem
        name_parts = self.fullname.split("-")
        self.name = name_parts[0]
        self.version = name_parts[1]

    @cached_property
    def _chapters(self) -> dict[str, Chapter]:
        """Map chapter names (file stem, underscores as spaces) to chapters.

        Built once on first access from every ``*.html`` file directly inside
        :attr:`path`.
        """
        return {
            path.stem.replace("_", " "): Chapter(path)
            for path in self.path.glob("*.html")
        }

    @cached_property
    def path_pdf(self) -> Path:
        """Path of the matching PDF: ``/html/`` swapped for ``/pdf/`` plus ``.pdf``."""
        return Path(str(self.path).replace("/html/", "/pdf/") + ".pdf")

    def chapters(self, pattern: str = None) -> list[Chapter]:
        """Return the chapters whose name matches ``pattern``.

        :param pattern: regular expression searched case-insensitively in the
                        chapter name; ``None`` selects every chapter.
        """
        if pattern is None:
            return list(self._chapters.values())
        return [
            chapter
            for name, chapter in self._chapters.items()
            if re.search(pattern, name, re.IGNORECASE)
        ]

    def chapter(self, pattern: str) -> Chapter:
        """Return the single chapter whose name matches ``pattern``.

        :param pattern: regular expression searched case-insensitively in the
                        chapter name.
        :raises AssertionError: if zero or more than one chapter matches. The
                                problem is logged before raising.
        """
        chapters = self.chapters(pattern)
        if len(chapters) == 1:
            return chapters[0]
        if not chapters:
            message = f"Cannot find chapter with pattern '{pattern}'!"
            LOGGER.error(message)
        else:
            message = f"Found multiple chapters with pattern '{pattern}'!"
            LOGGER.error(message)
            for chapter in chapters:
                LOGGER.error(f"  - {chapter.name}")
        # Raised explicitly instead of using `assert`, so the uniqueness check
        # is still enforced when Python runs with -O.
        raise AssertionError(message)

    def __repr__(self) -> str:
        return f"Doc({self.fullname})"
Document(path: str)
16    def __init__(self, path: str):
17        self.path = Path(path)
18        self.relpath = os.path.relpath(self.path, Path().cwd())
19        self.fullname = self.path.stem
20        self.name = self.fullname.split("-")[0]
21        self.version = self.fullname.split("-")[1]
path
relpath
fullname
name
version
path_pdf: str
30    @cached_property
31    def path_pdf(self) -> str:
32        return Path(str(self.path).replace("/html/", "/pdf/") + ".pdf")
def chapters(self, pattern: str = None) -> list[Chapter]:
34    def chapters(self, pattern: str = None) -> list[Chapter]:
35        if pattern is None:
36            return list(self._chapters.values())
37        return [c for name, c in self._chapters.items() if re.search(pattern, name, re.IGNORECASE)]
def chapter(self, pattern: str) -> Chapter:
39    def chapter(self, pattern: str) -> Chapter:
40        chapters = self.chapters(pattern)
41        if len(chapters) == 0:
42            LOGGER.error(f"Cannot find chapter with pattern '{pattern}'!")
43        if len(chapters) > 1:
44            LOGGER.error(f"Found multiple chapters with pattern '{pattern}'!")
45            for chapter in chapters:
46                LOGGER.error(f"  - {chapter.name}")
47        assert len(chapters) == 1
48        return chapters[0]
class Chapter:
    """One chapter of a document, backed by a single HTML file.

    The file is parsed lazily on first access to :attr:`items`. Two chapters
    compare equal, and hash equal, when their file stems are identical.
    """

    def __init__(self, path: str):
        """
        :param path: location of the chapter's HTML file.
        """
        self._path = Path(path)

    @cached_property
    def _parser(self):
        """Parser that has been fed the chapter file; created once per chapter."""
        parser = Parser()
        parser.feed(self._path.read_text())
        return parser

    @property
    def _relpath(self) -> Path:
        """Chapter file path relative to the current working directory."""
        return self._path.relative_to(Path.cwd())

    @property
    def name(self) -> str:
        """File stem with underscores replaced by spaces."""
        return self._path.stem.replace("_", " ")

    @property
    def number(self) -> int:
        """Integer taken from the second ``_``-separated field of the file stem."""
        return int(self._path.stem.split("_")[1])

    @property
    def items(self) -> list:
        """All items collected by the parser, in parser order."""
        return self._parser._items

    def headings(self) -> list[Heading]:
        """Return every item that is a :class:`Heading`."""
        return [item for item in self.items if isinstance(item, Heading)]

    def texts(self) -> list[Text]:
        """Return every item that is a :class:`Text`, subclasses included."""
        return [item for item in self.items if isinstance(item, Text)]

    def _heading_objects(self, obj_type, pattern, **subs) -> list:
        """Group items of ``obj_type`` under the heading that precedes them.

        :param obj_type: class of the items to collect.
        :param pattern: regular expression searched case-insensitively in the
                        heading text; ``None`` disables filtering.
        :param subs: substitutions forwarded to the heading's ``text()``.
        :return: ``(heading, [items])`` tuples. ``heading`` is ``None`` for
                 items that appear before the first heading, and groups
                 without any item are left out.
        """
        heading_texts = []
        current = [None, []]
        for item in self.items:
            if isinstance(item, Heading):
                # A new heading closes the previous group, if it has content.
                if current[1]:
                    heading_texts.append(tuple(current))
                current = [item, []]
            elif isinstance(item, obj_type):
                current[1].append(item)
        if current[1]:
            heading_texts.append(tuple(current))
        if pattern is None:
            return heading_texts
        return [
            ht
            for ht in heading_texts
            # Groups without a heading are matched against the empty string.
            if re.search(pattern, ht[0].text(**subs) if ht[0] is not None else "", re.IGNORECASE)
        ]

    def heading_texts(self, pattern=None, **subs) -> list:
        """Return ``(heading, [Text, ...])`` groups, optionally filtered by heading."""
        return self._heading_objects(Text, pattern, **subs)

    def heading_tables(self, pattern=None, **subs) -> list:
        """Return ``(heading, [Table, ...])`` groups, optionally filtered by heading."""
        return self._heading_objects(Table, pattern, **subs)

    def tables(self, pattern: str = None, **subs) -> list[Table]:
        """Return the tables whose caption matches ``pattern``.

        :param pattern: regular expression searched case-insensitively in the
                        table caption; ``None`` selects every table.
        :param subs: substitutions forwarded to the table's ``caption()``.
        """
        tables = [item for item in self.items if isinstance(item, Table)]
        if pattern is None:
            return tables
        return [t for t in tables if re.search(pattern, t.caption(**subs), re.IGNORECASE)]

    def table(self, pattern: str) -> Table:
        """Return the single table whose caption matches ``pattern``.

        :raises AssertionError: if zero or more than one table matches. The
                                problem is logged before raising.
        """
        tables = self.tables(pattern)
        if len(tables) == 1:
            return tables[0]
        if not tables:
            message = f"Cannot find table with pattern '{pattern}'!"
        else:
            message = f"Found multiple tables with pattern '{pattern}'!"
        LOGGER.error(message)
        # Raised explicitly instead of using `assert`, so the uniqueness check
        # is still enforced when Python runs with -O.
        raise AssertionError(message)

    def __hash__(self) -> int:
        return hash(self._path.stem)

    def __eq__(self, other) -> bool:
        # Equality uses the same key as __hash__: the file stem.
        if not isinstance(other, Chapter):
            return NotImplemented
        return self._path.stem == other._path.stem

    def __repr__(self) -> str:
        return f"Chapter({self.name})"
Chapter(path: str)
17    def __init__(self, path: str):
18        self._path = Path(path)
name: str
30    @property
31    def name(self) -> str:
32        return self._path.stem.replace("_", " ")
number: int
34    @property
35    def number(self) -> int:
36        return int(self._path.stem.split("_")[1])
items: list
38    @property
39    def items(self) -> list:
40        return self._parser._items
def headings(self) -> list[str]:
42    def headings(self) -> list[str]:
43        return [h for h in self.items if isinstance(h, Heading)]
def texts(self) -> list[str]:
45    def texts(self) -> list[str]:
46        return [t for t in self.items if isinstance(t, Text)]
def heading_texts(self, pattern=None, **subs) -> list:
68    def heading_texts(self, pattern=None, **subs) -> list:
69        return self._heading_objects(Text, pattern, **subs)
def heading_tables(self, pattern=None, **subs) -> list:
71    def heading_tables(self, pattern=None, **subs) -> list:
72        return self._heading_objects(Table, pattern, **subs)
def tables(self, pattern: str = None, **subs) -> list[Table]:
74    def tables(self, pattern: str = None, **subs) -> list[Table]:
75        tables = [t for t in self.items if isinstance(t, Table)]
76        if pattern is None:
77            return tables
78        return [t for t in tables if re.search(pattern, t.caption(**subs), re.IGNORECASE)]
def table(self, pattern: str) -> Table:
80    def table(self, pattern: str) -> Table:
81        tables = self.tables(pattern)
82        if len(tables) == 0:
83            LOGGER.error(f"Cannot find table with pattern '{pattern}'!")
84        if len(tables) > 1:
85            LOGGER.error(f"Found multiple tables with pattern '{pattern}'!")
86        assert len(tables) == 1
87        return tables[0]
class Table:
    """Grid view over the cells of one table.

    Cells are collected in ``_cells`` and laid out into a rectangular grid by
    :meth:`_normalize`. Header rows come first; their cell texts, joined with
    ``:``, form the "domain" name of each column.
    """

    def __init__(self, heading=None):
        # Heading associated with the table; an empty Heading if none is given.
        self._heading = heading or Heading("")
        # Flat list of cells, placed into the grid by _normalize().
        self._cells = []
        # (columns, rows); stays (0, 0) until _normalize() runs.
        self._size = (0, 0)
        # Row-major grid, indexed as _grid[y][x]; None until _normalize() runs.
        self._grid = None
        # Number of header rows at the top of the grid.
        self._hrows = 0
        self._caption = Text("")

    def __repr__(self) -> str:
        return f"Table({self.columns}×{self.rows})"

    def heading(self, **filters):
        """Return the heading text with ``filters`` passed on to ``text()``."""
        return self._heading.text(**filters)

    def caption(self, **filters):
        """Return the caption text with ``filters`` passed on to ``text()``."""
        return self._caption.text(**filters)

    def _domains_x(self, **subs) -> dict[str, list[int]]:
        """Map each column domain name to the column indices that carry it.

        For every column the header rows are walked top to bottom. A cell is
        added whenever it differs from the previous one, so a header cell that
        spans several rows is only counted once. The collected texts, with any
        ``:`` removed, are joined by ``:``.
        """
        domains = defaultdict(list)
        for x in range(self.columns):
            cell = None
            domain = []
            for y in range(self._hrows):
                if (ncell := self.cell(x, y)) != cell:
                    cell = ncell
                    domain.append(cell)
            domain = ":".join(cell.text(**subs).replace(":", "") for cell in domain)
            domains[domain].append(x)
        return dict(domains)

    def _domains_y(self, columns: list[int], **subs) -> dict[str, list[int]]:
        """Map each row domain name to the body row indices that carry it.

        For every row below the header, the cells in ``columns`` are walked in
        the given order. A cell is added whenever it differs from the previous
        one. The collected texts, with any ``:`` removed, are joined by ``:``.
        Rows that yield no cell (empty ``columns``) are skipped.
        """
        domains = defaultdict(list)
        for y in range(self._hrows, self.rows):
            cell = None
            cells = []
            for x in columns:
                if (ncell := self.cell(x, y)) != cell:
                    cell = ncell
                    cells.append(cell)
            if cells:
                domain = ":".join(cell.text(**subs).replace(":", "") for cell in cells)
                domains[domain].append(y)
        return dict(domains)

    def domains_x(self, pattern=None, **subs) -> list[str]:
        """Return the sorted column domain names.

        :param pattern: optional regular expression, searched
                        case-insensitively, that a name must match.
        """
        domains = sorted(self._domains_x(**subs).keys())
        if pattern is not None:
            domains = [d for d in domains if re.search(pattern, d, re.IGNORECASE)]
        return domains

    def domains(self, pattern: str, **subs) -> Domains:
        """Build a ``Domains`` object for the columns whose domain matches.

        :param pattern: regular expression searched case-insensitively in each
                        column domain name.
        :return: ``Domains`` constructed from this table, the matching domain
                 names, their column indices and ``pattern``.
        """
        domains = []
        columns = []
        for domain, cols in self._domains_x(**subs).items():
            if re.search(pattern, domain, re.IGNORECASE):
                domains.append(domain)
                columns.extend(cols)
        return Domains(self, domains, columns, pattern)

    def cell_rows(self, pattern: str = None, **subs) -> dict[str, list[Cell]]:
        """Yield one ``ReDict`` per body row, mapping domain name to its cells.

        NOTE: this is a generator. The annotated return type describes the
        mapping held by each yielded item, not the return value itself.

        :param pattern: optional regular expression, searched
                        case-insensitively, selecting the column domains to
                        include; ``None`` includes all of them.
        """
        columns = []
        for domain, cols in self._domains_x(**subs).items():
            if pattern is None or re.search(pattern, domain, re.IGNORECASE):
                columns.append((domain, cols))
        for y in range(self._hrows, self.rows):
            values = defaultdict(list)
            for domain, cols in columns:
                values[domain].extend(self.cell(c, y) for c in cols)
            yield ReDict(values)

    def cell(self, x: int, y: int) -> Cell:
        """Return the grid entry at column ``x``, row ``y``.

        Only the upper bounds are asserted. The entry is ``None`` for a
        position that no cell was placed on.
        """
        assert x < self.columns
        assert y < self.rows
        return self._grid[y][x]

    @property
    def columns(self) -> int:
        """Number of grid columns."""
        return self._size[0]

    @property
    def rows(self) -> int:
        """Number of grid rows, header rows included."""
        return self._size[1]

    @property
    def size(self) -> tuple[int, int]:
        """Grid size as ``(columns, rows)``."""
        return self._size

    def _normalize(self):
        """Lay the collected cells out into the rectangular grid.

        Sets ``_size``, ``_grid`` and ``_hrows``, and rewrites each cell's
        ``_pos`` to the list of every ``(x, y)`` grid position it occupies.
        """
        # Width: summed column spans of the cells that start in row 0.
        xsize = sum(c._span[0] for c in self._cells if c._pos[0][1] == 0)
        # Height: lowest row reached by any cell (start row + row span).
        ysize = max(c._pos[0][1] + c._span[1] for c in self._cells)
        self._size = (xsize, ysize)
        self._grid = [[None for _ in range(xsize)] for _ in range(ysize)]

        xpos, ypos = 0, 0
        for cell in self._cells:
            # print(cell._pos, cell._span, cell._data)
            ypos = cell._pos[0][1]
            if cell._head:
                # The last header cell seen determines the header row count.
                self._hrows = ypos + 1
            cell._pos = []
            # Previous cells with multiple rows may already have taken our current xpos
            # We must find the next cell that's still empty
            xpos = next((x for x, c in enumerate(self._grid[ypos]) if c is None), xpos)
            # Fill the spanned area, clipped to the grid bounds.
            for y in range(ypos, min(ypos + cell._span[1], ysize)):
                for x in range(xpos, min(xpos + cell._span[0], xsize)):
                    self._grid[y][x] = cell
                    cell._pos.append((x, y))
            xpos += cell._span[0]

    def render(self):
        """Print the grid row by row, tab-separated.

        Each cell's text is cut to 15 characters; unfilled positions print as
        ``None``.
        """
        for y in range(self.rows):
            for x in range(self.columns):
                print(self.cell(x, y).text()[:15] if self.cell(x, y) is not None else "None", end="\t")
            print()
Table(heading=None)
79    def __init__(self, heading=None):
80        self._heading = heading or Heading("")
81        self._cells = []
82        self._size = (0, 0)
83        self._grid = None
84        self._hrows = 0
85        self._caption = Text("")
def heading(self, **filters):
90    def heading(self, **filters):
91        return self._heading.text(**filters)
def caption(self, **filters):
93    def caption(self, **filters):
94        return self._caption.text(**filters)
def domains_x(self, pattern=None, **subs) -> list[str]:
123    def domains_x(self, pattern=None, **subs) -> list[str]:
124        domains = sorted(self._domains_x(**subs).keys())
125        if pattern is not None:
126            domains = [d for d in domains if re.search(pattern, d, re.IGNORECASE)]
127        return domains
def domains(self, pattern: str, **subs) -> modm_data.html.table.Domains:
129    def domains(self, pattern: str, **subs) -> Domains:
130        domains = []
131        columns = []
132        for domain, cols in self._domains_x(**subs).items():
133            if re.search(pattern, domain, re.IGNORECASE):
134                domains.append(domain)
135                columns.extend(cols)
136        return Domains(self, domains, columns, pattern)
def cell_rows( self, pattern: str = None, **subs) -> dict[str, list[modm_data.html.table.Cell]]:
138    def cell_rows(self, pattern: str = None, **subs) -> dict[str, list[Cell]]:
139        columns = []
140        for domain, cols in self._domains_x(**subs).items():
141            if pattern is None or re.search(pattern, domain, re.IGNORECASE):
142                columns.append((domain, cols))
143        for y in range(self._hrows, self.rows):
144            values = defaultdict(list)
145            for domain, cols in columns:
146                values[domain].extend(self.cell(c, y) for c in cols)
147            yield ReDict(values)
def cell(self, x: int, y: int) -> modm_data.html.table.Cell:
149    def cell(self, x: int, y: int) -> Cell:
150        assert x < self.columns
151        assert y < self.rows
152        return self._grid[y][x]
columns: int
154    @property
155    def columns(self) -> int:
156        return self._size[0]
rows: int
158    @property
159    def rows(self) -> int:
160        return self._size[1]
size: tuple[int, int]
162    @property
163    def size(self) -> tuple[int, int]:
164        return self._size
def render(self):
188    def render(self):
189        for y in range(self.rows):
190            for x in range(self.columns):
191                print(self.cell(x, y).text()[:15] if self.cell(x, y) is not None else "None", end="\t")
192            print()
class Text:
    """A fragment of HTML markup that can be rendered to plain text."""

    def __init__(self, html=None):
        # Any falsy value (None, "") is stored as the empty string.
        self.html = html if html else ""

    def text(self, **filters) -> str:
        """Return the markup after running it through ``replace`` with ``filters``."""
        markup = self.html
        return replace(markup, **filters)

    def __repr__(self) -> str:
        preview = self.html[:70]
        return f"Text({preview})"
Text(html=None)
61    def __init__(self, html=None):
62        self.html = html or ""
html
def text(self, **filters) -> str:
64    def text(self, **filters) -> str:
65        return replace(self.html, **filters)
class Heading(Text):
    """A :class:`Text` that marks a heading; only its ``repr`` differs."""

    def __repr__(self) -> str:
        preview = self.html[:70]
        return f"Heading({preview})"
Inherited Members
Text
Text
html
text
class List(Text):
    """A :class:`Text` that holds the markup of a list."""

    def __init__(self, html):
        # Text.__init__ sets `self.html`, which the inherited text() reads.
        # Without it text(), and therefore __repr__, raise AttributeError.
        super().__init__(html)
        # Kept for code that reads the private attribute directly.
        self._html = html

    def __repr__(self) -> str:
        return f"List({self.text()[:10]})"
List(html)
 9    def __init__(self, html):
10        self._html = html
Inherited Members
Text
html
text
def replace(html, **substitutions) -> str:
    """Strip or rewrite inline HTML tags and apply extra regex substitutions.

    The inline tags ``u``, ``i``, ``b``, ``sub``, ``sup``, ``br`` and ``p``
    default to ``"*"``, which removes the tag markers and keeps their content.

    :param html: markup to process.
    :param substitutions: mapping of key to replacement, applied in order.
        For an inline tag key, ``"*"`` removes the markers. Any other value
        replaces the whole ``<tag>...</tag>`` element and may refer to the
        enclosed text as ``\\1``. For ``br``, a standalone ``<br>`` or
        ``<br/>`` is replaced as well, with ``\\1`` expanding to nothing.
        Any other key is used as a regular expression and replaced by its
        value.
    :return: the processed string.
    """
    inline_tags = ("u", "i", "b", "sub", "sup", "br", "p")
    subs = dict.fromkeys(inline_tags, "*")
    subs.update(substitutions)
    for tag, replacement in subs.items():
        if tag not in inline_tags:
            html = re.sub(tag, replacement, html)
        elif replacement == "*":
            html = re.sub(f"</?{tag}>", "", html)
        elif tag == "br":
            # <br> is a void element and normally has no closing tag, so the
            # paired pattern alone never matches it. The paired form is tried
            # first so markup that matched before is handled the same way.
            html = re.sub(r"<br>(.*?)</br>|<br\s*/?>", replacement, html)
        else:
            html = re.sub(f"<{tag}>(.*?)</{tag}>", replacement, html)
    return html
def listify(text, pattern=None, strip=True) -> list[str]:
    """Split ``text`` into its non-empty pieces.

    :param text: string to split.
    :param pattern: regular expression used as separator; defaults to a
                    space, ``,``, ``/`` or ``<br>``.
    :param strip: strip whitespace from each piece and drop pieces that are
                  empty afterwards; otherwise only empty pieces are dropped.
    :return: the remaining pieces in their original order.
    """
    separator = " |,|/|<br>" if pattern is None else pattern
    pieces = re.split(separator, text)
    if not strip:
        return [piece for piece in pieces if piece]
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]