modm_data.html.parser

View Source

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4import re
  5import os.path
  6import logging
  7from functools import cached_property
  8from html.parser import HTMLParser
  9from .table import Table, Cell
 10from .text import Text, Heading
 11
 12LOGGER = logging.getLogger(__name__)
 13
 14class Parser(HTMLParser):
 15    def __init__(self):
 16        super().__init__(convert_charrefs=True)
 17        self._items = []
 18        self._ignore_tags = ["span"]
 19        self._type = None
 20        self._tags = []
 21
 22        self._ty = -1
 23        self._tx = -1
 24        self._table = None
 25        self._cell = None
 26        self._data = ""
 27        self._collect_data = False
 28
 29    def _clear_data(self):
 30        self._collect_data = False
 31        data = self._data.replace("\n", "").replace("\r", "")
 32        data = data.strip()
 33        return data
 34
 35    def handle_starttag(self, tag, attrs):
 36        if self._collect_data and tag not in self._ignore_tags:
 37            self._data += f"<{tag}>"
 38
 39        if tag in ["table", "th", "tr", "td", "caption"]:
 40            self._data = ""
 41            self._collect_data = True
 42            if tag == "table":
 43                heading = next((i for i in reversed(self._items) if isinstance(i, Heading)), None)
 44                self._table = Table(heading=heading)
 45                self._type = "t"
 46                self._ty = -1
 47            if self._table and tag == "tr":
 48                self._ty += 1
 49                self._tx = -1
 50            if self._table and tag in ["th", "td"]:
 51                self._tx += 1
 52                tys = next((a[1] for a in  attrs if a[0] == "rowspan"), 1)
 53                txs = next((a[1] for a in  attrs if a[0] == "colspan"), 1)
 54                self._cell = Cell(self._tx, self._ty, int(txs), int(tys), tag == "th")
 55
 56        elif re.match(r"h[1-6]", tag):
 57            self._data = ""
 58            self._collect_data = True
 59            self._type = "h"
 60
 61        elif self._type is None:
 62            self._data = ""
 63            self._collect_data = True
 64            self._type = (tag, len(self._tags))
 65
 66        self._tags.append(tag)
 67
 68    def handle_data(self, data):
 69        if self._collect_data:
 70            self._data += data
 71
 72    def handle_endtag(self, tag):
 73        self._tags.pop()
 74
 75        if re.match(r"h[1-6]", tag):
 76            self._items.append(Heading(self._clear_data()))
 77            self._type = None
 78
 79        elif self._table and tag == "caption":
 80            self._table._caption = Text(self._clear_data())
 81
 82        elif self._cell and tag in ["th", "td"]:
 83            self._cell.html = self._clear_data()
 84            self._table._cells.append(self._cell)
 85            self._cell = None
 86
 87        elif self._table and tag == "table":
 88            if self._table._cells:
 89                self._table._normalize()
 90                if self._table.size > (1,1) or self._table.cell(0,0).html != "(omitted)":
 91                    self._items.append(self._table)
 92            self._table = None
 93            self._type = None
 94
 95        if self._type == (tag, len(self._tags)):
 96            self._type = None
 97            self._items.append(Text(self._clear_data()))
 98
 99        if self._collect_data and tag not in self._ignore_tags:
100            self._data += f"</{tag}>"

LOGGER = <Logger modm_data.html.parser (WARNING)>

class Parser(html.parser.HTMLParser): View Source

 15class Parser(HTMLParser):
 16    def __init__(self):
 17        super().__init__(convert_charrefs=True)
 18        self._items = []
 19        self._ignore_tags = ["span"]
 20        self._type = None
 21        self._tags = []
 22
 23        self._ty = -1
 24        self._tx = -1
 25        self._table = None
 26        self._cell = None
 27        self._data = ""
 28        self._collect_data = False
 29
 30    def _clear_data(self):
 31        self._collect_data = False
 32        data = self._data.replace("\n", "").replace("\r", "")
 33        data = data.strip()
 34        return data
 35
 36    def handle_starttag(self, tag, attrs):
 37        if self._collect_data and tag not in self._ignore_tags:
 38            self._data += f"<{tag}>"
 39
 40        if tag in ["table", "th", "tr", "td", "caption"]:
 41            self._data = ""
 42            self._collect_data = True
 43            if tag == "table":
 44                heading = next((i for i in reversed(self._items) if isinstance(i, Heading)), None)
 45                self._table = Table(heading=heading)
 46                self._type = "t"
 47                self._ty = -1
 48            if self._table and tag == "tr":
 49                self._ty += 1
 50                self._tx = -1
 51            if self._table and tag in ["th", "td"]:
 52                self._tx += 1
 53                tys = next((a[1] for a in  attrs if a[0] == "rowspan"), 1)
 54                txs = next((a[1] for a in  attrs if a[0] == "colspan"), 1)
 55                self._cell = Cell(self._tx, self._ty, int(txs), int(tys), tag == "th")
 56
 57        elif re.match(r"h[1-6]", tag):
 58            self._data = ""
 59            self._collect_data = True
 60            self._type = "h"
 61
 62        elif self._type is None:
 63            self._data = ""
 64            self._collect_data = True
 65            self._type = (tag, len(self._tags))
 66
 67        self._tags.append(tag)
 68
 69    def handle_data(self, data):
 70        if self._collect_data:
 71            self._data += data
 72
 73    def handle_endtag(self, tag):
 74        self._tags.pop()
 75
 76        if re.match(r"h[1-6]", tag):
 77            self._items.append(Heading(self._clear_data()))
 78            self._type = None
 79
 80        elif self._table and tag == "caption":
 81            self._table._caption = Text(self._clear_data())
 82
 83        elif self._cell and tag in ["th", "td"]:
 84            self._cell.html = self._clear_data()
 85            self._table._cells.append(self._cell)
 86            self._cell = None
 87
 88        elif self._table and tag == "table":
 89            if self._table._cells:
 90                self._table._normalize()
 91                if self._table.size > (1,1) or self._table.cell(0,0).html != "(omitted)":
 92                    self._items.append(self._table)
 93            self._table = None
 94            self._type = None
 95
 96        if self._type == (tag, len(self._tags)):
 97            self._type = None
 98            self._items.append(Text(self._clear_data()))
 99
100        if self._collect_data and tag not in self._ignore_tags:
101            self._data += f"</{tag}>"

Find tags and other markup and call handler functions.

Usage: create the parser with `p = HTMLParser()`, call `p.feed(data)` as often as needed, then finish with `p.close()`.

Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument.

Parser() View Source

16    def __init__(self):
17        super().__init__(convert_charrefs=True)
18        self._items = []
19        self._ignore_tags = ["span"]
20        self._type = None
21        self._tags = []
22
23        self._ty = -1
24        self._tx = -1
25        self._table = None
26        self._cell = None
27        self._data = ""
28        self._collect_data = False

Initialize and reset this instance.

If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters.

def handle_starttag(self, tag, attrs): View Source

36    def handle_starttag(self, tag, attrs):
37        if self._collect_data and tag not in self._ignore_tags:
38            self._data += f"<{tag}>"
39
40        if tag in ["table", "th", "tr", "td", "caption"]:
41            self._data = ""
42            self._collect_data = True
43            if tag == "table":
44                heading = next((i for i in reversed(self._items) if isinstance(i, Heading)), None)
45                self._table = Table(heading=heading)
46                self._type = "t"
47                self._ty = -1
48            if self._table and tag == "tr":
49                self._ty += 1
50                self._tx = -1
51            if self._table and tag in ["th", "td"]:
52                self._tx += 1
53                tys = next((a[1] for a in  attrs if a[0] == "rowspan"), 1)
54                txs = next((a[1] for a in  attrs if a[0] == "colspan"), 1)
55                self._cell = Cell(self._tx, self._ty, int(txs), int(tys), tag == "th")
56
57        elif re.match(r"h[1-6]", tag):
58            self._data = ""
59            self._collect_data = True
60            self._type = "h"
61
62        elif self._type is None:
63            self._data = ""
64            self._collect_data = True
65            self._type = (tag, len(self._tags))
66
67        self._tags.append(tag)

def handle_data(self, data): View Source

69    def handle_data(self, data):
70        if self._collect_data:
71            self._data += data

def handle_endtag(self, tag): View Source

 73    def handle_endtag(self, tag):
 74        self._tags.pop()
 75
 76        if re.match(r"h[1-6]", tag):
 77            self._items.append(Heading(self._clear_data()))
 78            self._type = None
 79
 80        elif self._table and tag == "caption":
 81            self._table._caption = Text(self._clear_data())
 82
 83        elif self._cell and tag in ["th", "td"]:
 84            self._cell.html = self._clear_data()
 85            self._table._cells.append(self._cell)
 86            self._cell = None
 87
 88        elif self._table and tag == "table":
 89            if self._table._cells:
 90                self._table._normalize()
 91                if self._table.size > (1,1) or self._table.cell(0,0).html != "(omitted)":
 92                    self._items.append(self._table)
 93            self._table = None
 94            self._type = None
 95
 96        if self._type == (tag, len(self._tags)):
 97            self._type = None
 98            self._items.append(Text(self._clear_data()))
 99
100        if self._collect_data and tag not in self._ignore_tags:
101            self._data += f"</{tag}>"

Inherited Members

html.parser.HTMLParser: CDATA_CONTENT_ELEMENTS; convert_charrefs; reset; feed; close; get_starttag_text; set_cdata_mode; clear_cdata_mode; goahead; parse_html_declaration; parse_bogus_comment; parse_pi; parse_starttag; check_for_whole_start_tag; parse_endtag; handle_startendtag; handle_charref; handle_entityref; handle_comment; handle_decl; handle_pi; unknown_decl
_markupbase.ParserBase: getpos; updatepos; parse_declaration; parse_marked_section; parse_comment