modm_data.pdf2html.line

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4from functools import cached_property
 5from ..utils import Rectangle
 6
 7
 8class CharCluster:
 9    """
10    A cluster of characters separated by less than two space-widths.
11    These denote the use of additional white space that is not encoded in the
12    character stream of the PDF page.
13    """
14
15    def __init__(self, line, chars: list):
16        self._line = line
17        self.chars = chars
18
19    @cached_property
20    def content(self) -> str:
21        return "".join(c.char for c in self.chars)
22
23    @cached_property
24    def bbox(self) -> Rectangle:
25        return Rectangle(min(c.bbox.left for c in self.chars),
26                         min(c.bbox.bottom for c in self.chars),
27                         max(c.bbox.right for c in self.chars),
28                         max(c.bbox.top for c in self.chars))
29
30
class CharLine:
    """
    A line of characters with super- and sub-script characters merged into it.

    The line stores the characters and the vertical metrics, rotation and
    offset passed to the constructor, and derives its bounding box, fonts,
    text content and character clusters from the characters.
    """

    def __init__(self, page, chars: list, bottom: float,
                 origin: float, top: float,
                 height: float = None, rotation: int = 0,
                 offset: float = 0, sort_origin: float = None):
        """
        :param page: the owning page; `clusters()` reads its
            `_spacing["x_em"]` entry as the default gap tolerance.
        :param chars: the characters of this line.
        :param bottom: stored as `bottom`.
        :param origin: stored as `origin`.
        :param top: stored as `top`.
        :param height: stored as `height`; a falsy value (`None` or `0`)
            is replaced by `top - bottom`.
        :param rotation: stored as `rotation`.
        :param offset: stored as `offset`.
        :param sort_origin: stored privately; defaults to `origin`.
        """
        self._page = page
        self.chars = chars
        self.bottom = bottom
        self.origin = origin
        self.top = top
        # NOTE: `or` also replaces an explicit height of 0, not only None.
        self.height = height or (top - bottom)
        self.rotation = rotation
        self.offset = offset
        self._sort_origin = origin if sort_origin is None else sort_origin

    @cached_property
    def bbox(self) -> Rectangle:
        """The smallest rectangle enclosing the boxes of all characters."""
        return Rectangle(min(c.bbox.left for c in self.chars),
                         min(c.bbox.bottom for c in self.chars),
                         max(c.bbox.right for c in self.chars),
                         max(c.bbox.top for c in self.chars))

    @cached_property
    def fonts(self) -> set:
        """The distinct truthy `font` values of the characters."""
        return {c.font for c in self.chars if c.font}

    def contains_font(self, *fragments) -> bool:
        """
        :return: `True` if any of the `fragments` is contained in any of the
            fonts of this line, otherwise `False`.
        """
        return any(fragment in font
                   for fragment in fragments
                   for font in self.fonts)

    @cached_property
    def content(self) -> str:
        """The `char` values of all characters concatenated in order."""
        return "".join(c.char for c in self.chars)

    def clusters(self, atol: float = None) -> list[CharCluster]:
        """
        Split the line into clusters of characters, including whitespace
        characters, wherever the horizontal gap is too large.

        A character joins the current cluster while the distance between the
        right edge of the reference character and its own left edge is below
        `atol`; otherwise it starts a new cluster. Whitespace characters
        (space, line feed, carriage return) stay in the cluster but do not
        become the reference character, so the gap is measured from the last
        non-whitespace character or from the first character of the cluster.

        :param atol: the gap threshold; defaults to the `x_em` spacing of the
            page.
        :return: the clusters in line order; an empty list if the line has
            no characters.
        """
        # A line without characters has no clusters (previously IndexError).
        if not self.chars:
            return []

        # Characters stay grouped while the space between them is < atol.
        if atol is None:
            atol = self._page._spacing["x_em"]

        clusters = []
        current_chars = [self.chars[0]]
        last_char = self.chars[0]
        for next_char in self.chars[1:]:
            if next_char.bbox.left - last_char.bbox.right < atol:
                # Keep this char in the current cluster
                current_chars.append(next_char)
                # Whitespace must not move the edge the next gap is
                # measured from.
                if next_char.unicode not in {0x20, 0xa, 0xd}:
                    last_char = next_char
            else:
                # Larger spacing detected, create a new cluster
                clusters.append(CharCluster(self, current_chars))
                current_chars = [next_char]
                last_char = next_char
        clusters.append(CharCluster(self, current_chars))

        return clusters

    def __repr__(self) -> str:
        return f"Line({len(self.chars)})"
class CharCluster:
 9class CharCluster:
10    """
11    A cluster of characters separated by less than two space-widths.
12    These denote the use of additional white space that is not encoded in the
13    character stream of the PDF page.
14    """
15
16    def __init__(self, line, chars: list):
17        self._line = line
18        self.chars = chars
19
20    @cached_property
21    def content(self) -> str:
22        return "".join(c.char for c in self.chars)
23
24    @cached_property
25    def bbox(self) -> Rectangle:
26        return Rectangle(min(c.bbox.left for c in self.chars),
27                         min(c.bbox.bottom for c in self.chars),
28                         max(c.bbox.right for c in self.chars),
29                         max(c.bbox.top for c in self.chars))

A cluster of characters separated by less than two space-widths. These denote the use of additional white space that is not encoded in the character stream of the PDF page.

CharCluster(line, chars: list)
16    def __init__(self, line, chars: list):
17        self._line = line
18        self.chars = chars
chars
content: str
20    @cached_property
21    def content(self) -> str:
22        return "".join(c.char for c in self.chars)
bbox: modm_data.utils.math.Rectangle
24    @cached_property
25    def bbox(self) -> Rectangle:
26        return Rectangle(min(c.bbox.left for c in self.chars),
27                         min(c.bbox.bottom for c in self.chars),
28                         max(c.bbox.right for c in self.chars),
29                         max(c.bbox.top for c in self.chars))
class CharLine:
 32class CharLine:
 33    """
 34    A line of characters with super- and sub-script chars merged into.
 35    """
 36
 37    def __init__(self, page, chars: list, bottom: float,
 38                 origin: float, top: float,
 39                 height: float = None, rotation: int = 0,
 40                 offset: float = 0, sort_origin: float = None):
 41        self._page = page
 42        self.chars = chars
 43        self.bottom = bottom
 44        self.origin = origin
 45        self.top = top
 46        self.height = height or (top - bottom)
 47        self.rotation = rotation
 48        self.offset = offset
 49        self._sort_origin = origin if sort_origin is None else sort_origin
 50
 51    @cached_property
 52    def bbox(self) -> Rectangle:
 53        return Rectangle(min(c.bbox.left for c in self.chars),
 54                         min(c.bbox.bottom for c in self.chars),
 55                         max(c.bbox.right for c in self.chars),
 56                         max(c.bbox.top for c in self.chars))
 57
 58    @cached_property
 59    def fonts(self) -> set:
 60        return set(c.font for c in self.chars if c.font)
 61
 62    def contains_font(self, *fragments) -> bool:
 63        for fragment in fragments:
 64            if any(fragment in font for font in self.fonts):
 65                return True
 66        return False
 67
 68    @cached_property
 69    def content(self) -> str:
 70        return "".join(c.char for c in self.chars)
 71
 72    def clusters(self, atol: float = None) -> list[CharCluster]:
 73        # Find clusters of characters in a line incl. whitespace chars
 74        def _cluster(clusters, chars):
 75            if chars:
 76                clusters.append(CharCluster(self, chars))
 77
 78        # We want to group the chars if the space between them is > 1em
 79        if atol is None:
 80            atol = self._page._spacing["x_em"] * 1
 81        clusters = []
 82        current_chars = [self.chars[0]]
 83        last_char = current_chars[0]
 84        for next_char in self.chars[1:]:
 85            if next_char.bbox.left - last_char.bbox.right < atol:
 86                # Keep this char in the current cluster
 87                current_chars.append(next_char)
 88                if next_char.unicode not in {0x20, 0xa, 0xd}:
 89                    last_char = next_char
 90            else:
 91                # Larger spacing detected, create a new cluster
 92                _cluster(clusters, current_chars)
 93                current_chars = [next_char]
 94                last_char = next_char
 95        _cluster(clusters, current_chars)
 96
 97        return clusters
 98
 99    def __repr__(self) -> str:
100        return f"Line({len(self.chars)})"

A line of characters with super- and sub-script characters merged into it.

CharLine( page, chars: list, bottom: float, origin: float, top: float, height: float = None, rotation: int = 0, offset: float = 0, sort_origin: float = None)
37    def __init__(self, page, chars: list, bottom: float,
38                 origin: float, top: float,
39                 height: float = None, rotation: int = 0,
40                 offset: float = 0, sort_origin: float = None):
41        self._page = page
42        self.chars = chars
43        self.bottom = bottom
44        self.origin = origin
45        self.top = top
46        self.height = height or (top - bottom)
47        self.rotation = rotation
48        self.offset = offset
49        self._sort_origin = origin if sort_origin is None else sort_origin
chars
bottom
origin
top
height
rotation
offset
bbox: modm_data.utils.math.Rectangle
51    @cached_property
52    def bbox(self) -> Rectangle:
53        return Rectangle(min(c.bbox.left for c in self.chars),
54                         min(c.bbox.bottom for c in self.chars),
55                         max(c.bbox.right for c in self.chars),
56                         max(c.bbox.top for c in self.chars))
fonts: set
58    @cached_property
59    def fonts(self) -> set:
60        return set(c.font for c in self.chars if c.font)
def contains_font(self, *fragments) -> bool:
62    def contains_font(self, *fragments) -> bool:
63        for fragment in fragments:
64            if any(fragment in font for font in self.fonts):
65                return True
66        return False
content: str
68    @cached_property
69    def content(self) -> str:
70        return "".join(c.char for c in self.chars)
def clusters(self, atol: float = None) -> list[CharCluster]:
72    def clusters(self, atol: float = None) -> list[CharCluster]:
73        # Find clusters of characters in a line incl. whitespace chars
74        def _cluster(clusters, chars):
75            if chars:
76                clusters.append(CharCluster(self, chars))
77
78        # We want to group the chars if the space between them is > 1em
79        if atol is None:
80            atol = self._page._spacing["x_em"] * 1
81        clusters = []
82        current_chars = [self.chars[0]]
83        last_char = current_chars[0]
84        for next_char in self.chars[1:]:
85            if next_char.bbox.left - last_char.bbox.right < atol:
86                # Keep this char in the current cluster
87                current_chars.append(next_char)
88                if next_char.unicode not in {0x20, 0xa, 0xd}:
89                    last_char = next_char
90            else:
91                # Larger spacing detected, create a new cluster
92                _cluster(clusters, current_chars)
93                current_chars = [next_char]
94                last_char = next_char
95        _cluster(clusters, current_chars)
96
97        return clusters