modm_data.html.parser
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

import re
import os.path
import logging
from functools import cached_property
from html.parser import HTMLParser
from .table import Table, Cell
from .text import Text, Heading

LOGGER = logging.getLogger(__name__)


class Parser(HTMLParser):
    """Collect the headings, tables and texts of an HTML document.

    While HTML is fed to the parser, ``Heading``, ``Table`` and ``Text``
    objects are appended to ``self._items`` in the order their end tags
    are encountered.
    """

    def __init__(self):
        """Initialize the HTML parser and reset all collection state."""
        super().__init__(convert_charrefs=True)
        # Parsed Heading, Table and Text objects in document order.
        self._items = []
        # Tags whose markup is not copied into the collected data.
        self._ignore_tags = ["span"]
        # What is currently collected: None, "h" (heading), "t" (table) or
        # a (tag, depth) tuple identifying the element that started a text.
        self._type = None
        # Stack of the currently open tags.
        self._tags = []

        # Row and column index of the current cell, -1 before the first one.
        self._ty = -1
        self._tx = -1
        self._table = None
        self._cell = None
        # Buffer for the data of the element being collected.
        self._data = ""
        self._collect_data = False

    @staticmethod
    def _span(attrs, name):
        """Return the integer value of the span attribute ``name``.

        The first occurrence in ``attrs`` wins. A missing, valueless
        (``None``) or non-numeric attribute yields 1 instead of raising.
        """
        raw = next((value for key, value in attrs if key == name), 1)
        try:
            return int(raw)
        except (TypeError, ValueError):
            return 1

    def _clear_data(self):
        """Stop collecting and return the cleaned-up buffer content.

        Line feeds and carriage returns are removed and surrounding
        whitespace is stripped. The buffer itself is left untouched here;
        ``handle_starttag`` resets it when the next collection starts.
        """
        self._collect_data = False
        return self._data.replace("\n", "").replace("\r", "").strip()

    def handle_starttag(self, tag, attrs):
        """Track an opening tag and start collecting data where needed.

        :param tag: lower-case tag name.
        :param attrs: list of ``(name, value)`` attribute pairs.
        """
        # Keep nested markup (without its attributes) in the collected data.
        if self._collect_data and tag not in self._ignore_tags:
            self._data += f"<{tag}>"

        if tag in ("table", "th", "tr", "td", "caption"):
            self._data = ""
            self._collect_data = True
            if tag == "table":
                # A table belongs to the most recently seen heading, if any.
                heading = next((i for i in reversed(self._items) if isinstance(i, Heading)), None)
                self._table = Table(heading=heading)
                self._type = "t"
                self._ty = -1
            if self._table and tag == "tr":
                self._ty += 1
                self._tx = -1
            if self._table and tag in ("th", "td"):
                self._tx += 1
                rowspan = self._span(attrs, "rowspan")
                colspan = self._span(attrs, "colspan")
                self._cell = Cell(self._tx, self._ty, colspan, rowspan, tag == "th")

        elif re.match(r"h[1-6]", tag):
            self._data = ""
            self._collect_data = True
            self._type = "h"

        elif self._type is None:
            # Nothing is being collected yet: this element starts a text.
            # Its nesting depth is remembered to find the matching end tag.
            self._data = ""
            self._collect_data = True
            self._type = (tag, len(self._tags))

        self._tags.append(tag)

    def handle_data(self, data):
        """Append character data to the buffer while collecting."""
        if self._collect_data:
            self._data += data

    def handle_endtag(self, tag):
        """Finish the element closed by ``tag`` and store the result.

        :param tag: lower-case tag name.
        """
        # Unwind to the matching start tag so that unclosed elements (e.g. a
        # void "<br>") do not corrupt the nesting depth. A stray end tag
        # without a matching start tag leaves the stack untouched.
        if tag in self._tags:
            while self._tags.pop() != tag:
                pass

        if re.match(r"h[1-6]", tag):
            self._items.append(Heading(self._clear_data()))
            self._type = None

        elif self._table and tag == "caption":
            self._table._caption = Text(self._clear_data())

        elif self._cell and tag in ("th", "td"):
            self._cell.html = self._clear_data()
            self._table._cells.append(self._cell)
            self._cell = None

        elif self._table and tag == "table":
            if self._table._cells:
                self._table._normalize()
                # Drop tables that consist only of a single "(omitted)" cell.
                if self._table.size > (1,1) or self._table.cell(0,0).html != "(omitted)":
                    self._items.append(self._table)
            self._table = None
            self._type = None

        # The element that started the current text has been closed.
        if self._type == (tag, len(self._tags)):
            self._type = None
            self._items.append(Text(self._clear_data()))

        # Keep nested closing markup in the collected data.
        if self._collect_data and tag not in self._ignore_tags:
            self._data += f"</{tag}>"
class Parser(HTMLParser):
    """Collect the headings, tables and texts of an HTML document.

    While HTML is fed to the parser, ``Heading``, ``Table`` and ``Text``
    objects are appended to ``self._items`` in the order their end tags
    are encountered.
    """

    def __init__(self):
        """Initialize the HTML parser and reset all collection state."""
        super().__init__(convert_charrefs=True)
        # Parsed Heading, Table and Text objects in document order.
        self._items = []
        # Tags whose markup is not copied into the collected data.
        self._ignore_tags = ["span"]
        # What is currently collected: None, "h" (heading), "t" (table) or
        # a (tag, depth) tuple identifying the element that started a text.
        self._type = None
        # Stack of the currently open tags.
        self._tags = []

        # Row and column index of the current cell, -1 before the first one.
        self._ty = -1
        self._tx = -1
        self._table = None
        self._cell = None
        # Buffer for the data of the element being collected.
        self._data = ""
        self._collect_data = False

    @staticmethod
    def _span(attrs, name):
        """Return the integer value of the span attribute ``name``.

        The first occurrence in ``attrs`` wins. A missing, valueless
        (``None``) or non-numeric attribute yields 1 instead of raising.
        """
        raw = next((value for key, value in attrs if key == name), 1)
        try:
            return int(raw)
        except (TypeError, ValueError):
            return 1

    def _clear_data(self):
        """Stop collecting and return the cleaned-up buffer content.

        Line feeds and carriage returns are removed and surrounding
        whitespace is stripped. The buffer itself is left untouched here;
        ``handle_starttag`` resets it when the next collection starts.
        """
        self._collect_data = False
        return self._data.replace("\n", "").replace("\r", "").strip()

    def handle_starttag(self, tag, attrs):
        """Track an opening tag and start collecting data where needed.

        :param tag: lower-case tag name.
        :param attrs: list of ``(name, value)`` attribute pairs.
        """
        # Keep nested markup (without its attributes) in the collected data.
        if self._collect_data and tag not in self._ignore_tags:
            self._data += f"<{tag}>"

        if tag in ("table", "th", "tr", "td", "caption"):
            self._data = ""
            self._collect_data = True
            if tag == "table":
                # A table belongs to the most recently seen heading, if any.
                heading = next((i for i in reversed(self._items) if isinstance(i, Heading)), None)
                self._table = Table(heading=heading)
                self._type = "t"
                self._ty = -1
            if self._table and tag == "tr":
                self._ty += 1
                self._tx = -1
            if self._table and tag in ("th", "td"):
                self._tx += 1
                rowspan = self._span(attrs, "rowspan")
                colspan = self._span(attrs, "colspan")
                self._cell = Cell(self._tx, self._ty, colspan, rowspan, tag == "th")

        elif re.match(r"h[1-6]", tag):
            self._data = ""
            self._collect_data = True
            self._type = "h"

        elif self._type is None:
            # Nothing is being collected yet: this element starts a text.
            # Its nesting depth is remembered to find the matching end tag.
            self._data = ""
            self._collect_data = True
            self._type = (tag, len(self._tags))

        self._tags.append(tag)

    def handle_data(self, data):
        """Append character data to the buffer while collecting."""
        if self._collect_data:
            self._data += data

    def handle_endtag(self, tag):
        """Finish the element closed by ``tag`` and store the result.

        :param tag: lower-case tag name.
        """
        # Unwind to the matching start tag so that unclosed elements (e.g. a
        # void "<br>") do not corrupt the nesting depth. A stray end tag
        # without a matching start tag leaves the stack untouched.
        if tag in self._tags:
            while self._tags.pop() != tag:
                pass

        if re.match(r"h[1-6]", tag):
            self._items.append(Heading(self._clear_data()))
            self._type = None

        elif self._table and tag == "caption":
            self._table._caption = Text(self._clear_data())

        elif self._cell and tag in ("th", "td"):
            self._cell.html = self._clear_data()
            self._table._cells.append(self._cell)
            self._cell = None

        elif self._table and tag == "table":
            if self._table._cells:
                self._table._normalize()
                # Drop tables that consist only of a single "(omitted)" cell.
                if self._table.size > (1,1) or self._table.cell(0,0).html != "(omitted)":
                    self._items.append(self._table)
            self._table = None
            self._type = None

        # The element that started the current text has been closed.
        if self._type == (tag, len(self._tags)):
            self._type = None
            self._items.append(Text(self._clear_data()))

        # Keep nested closing markup in the collected data.
        if self._collect_data and tag not in self._ignore_tags:
            self._data += f"</{tag}>"
Find tags and other markup and call handler functions.
Usage: p = HTMLParser() p.feed(data) ... p.close()
Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument.
16 def __init__(self): 17 super().__init__(convert_charrefs=True) 18 self._items = [] 19 self._ignore_tags = ["span"] 20 self._type = None 21 self._tags = [] 22 23 self._ty = -1 24 self._tx = -1 25 self._table = None 26 self._cell = None 27 self._data = "" 28 self._collect_data = False
Initialize and reset this instance.
If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters.
36 def handle_starttag(self, tag, attrs): 37 if self._collect_data and tag not in self._ignore_tags: 38 self._data += f"<{tag}>" 39 40 if tag in ["table", "th", "tr", "td", "caption"]: 41 self._data = "" 42 self._collect_data = True 43 if tag == "table": 44 heading = next((i for i in reversed(self._items) if isinstance(i, Heading)), None) 45 self._table = Table(heading=heading) 46 self._type = "t" 47 self._ty = -1 48 if self._table and tag == "tr": 49 self._ty += 1 50 self._tx = -1 51 if self._table and tag in ["th", "td"]: 52 self._tx += 1 53 tys = next((a[1] for a in attrs if a[0] == "rowspan"), 1) 54 txs = next((a[1] for a in attrs if a[0] == "colspan"), 1) 55 self._cell = Cell(self._tx, self._ty, int(txs), int(tys), tag == "th") 56 57 elif re.match(r"h[1-6]", tag): 58 self._data = "" 59 self._collect_data = True 60 self._type = "h" 61 62 elif self._type is None: 63 self._data = "" 64 self._collect_data = True 65 self._type = (tag, len(self._tags)) 66 67 self._tags.append(tag)
73 def handle_endtag(self, tag): 74 self._tags.pop() 75 76 if re.match(r"h[1-6]", tag): 77 self._items.append(Heading(self._clear_data())) 78 self._type = None 79 80 elif self._table and tag == "caption": 81 self._table._caption = Text(self._clear_data()) 82 83 elif self._cell and tag in ["th", "td"]: 84 self._cell.html = self._clear_data() 85 self._table._cells.append(self._cell) 86 self._cell = None 87 88 elif self._table and tag == "table": 89 if self._table._cells: 90 self._table._normalize() 91 if self._table.size > (1,1) or self._table.cell(0,0).html != "(omitted)": 92 self._items.append(self._table) 93 self._table = None 94 self._type = None 95 96 if self._type == (tag, len(self._tags)): 97 self._type = None 98 self._items.append(Text(self._clear_data())) 99 100 if self._collect_data and tag not in self._ignore_tags: 101 self._data += f"</{tag}>"
Inherited Members
- html.parser.HTMLParser
- CDATA_CONTENT_ELEMENTS
- convert_charrefs
- reset
- feed
- close
- get_starttag_text
- set_cdata_mode
- clear_cdata_mode
- goahead
- parse_html_declaration
- parse_bogus_comment
- parse_pi
- parse_starttag
- check_for_whole_start_tag
- parse_endtag
- handle_startendtag
- handle_charref
- handle_entityref
- handle_comment
- handle_decl
- handle_pi
- unknown_decl
- _markupbase.ParserBase
- getpos
- updatepos
- parse_declaration
- parse_marked_section
- parse_comment