modm_data.pdf2html.line

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4from functools import cached_property
  5from ..utils import Rectangle
  6from ..pdf import Character
  7
  8
  9class CharCluster:
 10    """
 11    A cluster of characters separated by less than two space-widths.
 12    These denote the use of additional white space that is not encoded in the
 13    character stream of the PDF page.
 14    """
 15
 16    def __init__(self, line: "CharLine", chars: list[Character]):
 17        self._line = line
 18        self.chars = chars
 19
 20    @cached_property
 21    def content(self) -> str:
 22        return "".join(c.char for c in self.chars)
 23
 24    @cached_property
 25    def bbox(self) -> Rectangle:
 26        return Rectangle(
 27            min(c.bbox.left for c in self.chars),
 28            min(c.bbox.bottom for c in self.chars),
 29            max(c.bbox.right for c in self.chars),
 30            max(c.bbox.top for c in self.chars),
 31        )
 32
 33
 34class CharLine:
 35    """
 36    A line of characters with super- and sub-script chars merged into.
 37    """
 38
 39    def __init__(
 40        self,
 41        page,
 42        chars: list,
 43        bottom: float,
 44        origin: float,
 45        top: float,
 46        height: float = None,
 47        rotation: int = 0,
 48        offset: float = 0,
 49        sort_origin: float = None,
 50    ):
 51        self._page = page
 52        self.chars = chars
 53        self.bottom = bottom
 54        self.origin = origin
 55        self.top = top
 56        self.height = height or (top - bottom)
 57        self.rotation = rotation
 58        self.offset = offset
 59        self._sort_origin = origin if sort_origin is None else sort_origin
 60
 61    @cached_property
 62    def bbox(self) -> Rectangle:
 63        """Bounding box of the character line"""
 64        return Rectangle(
 65            min(c.bbox.left for c in self.chars),
 66            min(c.bbox.bottom for c in self.chars),
 67            max(c.bbox.right for c in self.chars),
 68            max(c.bbox.top for c in self.chars),
 69        )
 70
 71    @cached_property
 72    def fonts(self) -> set[str]:
 73        """All font names in this character line"""
 74        return set(c.font for c in self.chars if c.font)
 75
 76    def contains_font(self, *fragments: str) -> bool:
 77        """:return: True if any fragment is part of the font names"""
 78        for fragment in fragments:
 79            if any(fragment in font for font in self.fonts):
 80                return True
 81        return False
 82
 83    @cached_property
 84    def content(self) -> str:
 85        """Text contained in the character line"""
 86        return "".join(c.char for c in self.chars)
 87
 88    def clusters(self, absolute_tolerance: float = None) -> list[CharCluster]:
 89        """Find clusters of characters in a line separated by `absolute_tolerance`."""
 90
 91        def _cluster(clusters, chars):
 92            if chars:
 93                clusters.append(CharCluster(self, chars))
 94
 95        # We want to group the chars if the space between them is > 1em
 96        if absolute_tolerance is None:
 97            absolute_tolerance = self._page._spacing["x_em"] * 1
 98        clusters = []
 99        current_chars = [self.chars[0]]
100        last_char = current_chars[0]
101        for next_char in self.chars[1:]:
102            if next_char.bbox.left - last_char.bbox.right < absolute_tolerance:
103                # Keep this char in the current cluster
104                current_chars.append(next_char)
105                if next_char.unicode not in {0x20, 0xA, 0xD}:
106                    last_char = next_char
107            else:
108                # Larger spacing detected, create a new cluster
109                _cluster(clusters, current_chars)
110                current_chars = [next_char]
111                last_char = next_char
112        _cluster(clusters, current_chars)
113
114        return clusters
115
116    def __repr__(self) -> str:
117        return f"Line({len(self.chars)})"
class CharCluster:
class CharCluster:
    """
    A group of neighbouring characters taken from one `CharLine`.

    `CharLine.clusters()` builds one cluster per run of characters whose
    horizontal gaps stay below its tolerance. The boundary between two clusters
    therefore denotes additional white space that is not encoded in the
    character stream of the PDF page.
    """

    def __init__(self, line: "CharLine", chars: list[Character]):
        """
        :param line: the line this cluster was cut from.
        :param chars: the characters belonging to this cluster.
        """
        self.chars = chars
        self._line = line

    @cached_property
    def content(self) -> str:
        """Text of the cluster: the `char` of every character, in order."""
        pieces = [character.char for character in self.chars]
        return "".join(pieces)

    @cached_property
    def bbox(self) -> Rectangle:
        """Smallest rectangle enclosing the bounding boxes of all characters."""
        boxes = [character.bbox for character in self.chars]
        left = min(box.left for box in boxes)
        bottom = min(box.bottom for box in boxes)
        right = max(box.right for box in boxes)
        top = max(box.top for box in boxes)
        return Rectangle(left, bottom, right, top)

A cluster of characters separated by less than two space-widths. These denote the use of additional white space that is not encoded in the character stream of the PDF page.

CharCluster( line: CharLine, chars: list[modm_data.pdf.Character])
17    def __init__(self, line: "CharLine", chars: list[Character]):
18        self._line = line
19        self.chars = chars
chars
content: str
21    @cached_property
22    def content(self) -> str:
23        return "".join(c.char for c in self.chars)
bbox: modm_data.utils.Rectangle
25    @cached_property
26    def bbox(self) -> Rectangle:
27        return Rectangle(
28            min(c.bbox.left for c in self.chars),
29            min(c.bbox.bottom for c in self.chars),
30            max(c.bbox.right for c in self.chars),
31            max(c.bbox.top for c in self.chars),
32        )
class CharLine:
 35class CharLine:
 36    """
 37    A line of characters with super- and sub-script chars merged into.
 38    """
 39
 40    def __init__(
 41        self,
 42        page,
 43        chars: list,
 44        bottom: float,
 45        origin: float,
 46        top: float,
 47        height: float = None,
 48        rotation: int = 0,
 49        offset: float = 0,
 50        sort_origin: float = None,
 51    ):
 52        self._page = page
 53        self.chars = chars
 54        self.bottom = bottom
 55        self.origin = origin
 56        self.top = top
 57        self.height = height or (top - bottom)
 58        self.rotation = rotation
 59        self.offset = offset
 60        self._sort_origin = origin if sort_origin is None else sort_origin
 61
 62    @cached_property
 63    def bbox(self) -> Rectangle:
 64        """Bounding box of the character line"""
 65        return Rectangle(
 66            min(c.bbox.left for c in self.chars),
 67            min(c.bbox.bottom for c in self.chars),
 68            max(c.bbox.right for c in self.chars),
 69            max(c.bbox.top for c in self.chars),
 70        )
 71
 72    @cached_property
 73    def fonts(self) -> set[str]:
 74        """All font names in this character line"""
 75        return set(c.font for c in self.chars if c.font)
 76
 77    def contains_font(self, *fragments: str) -> bool:
 78        """:return: True if any fragment is part of the font names"""
 79        for fragment in fragments:
 80            if any(fragment in font for font in self.fonts):
 81                return True
 82        return False
 83
 84    @cached_property
 85    def content(self) -> str:
 86        """Text contained in the character line"""
 87        return "".join(c.char for c in self.chars)
 88
 89    def clusters(self, absolute_tolerance: float = None) -> list[CharCluster]:
 90        """Find clusters of characters in a line separated by `absolute_tolerance`."""
 91
 92        def _cluster(clusters, chars):
 93            if chars:
 94                clusters.append(CharCluster(self, chars))
 95
 96        # We want to group the chars if the space between them is > 1em
 97        if absolute_tolerance is None:
 98            absolute_tolerance = self._page._spacing["x_em"] * 1
 99        clusters = []
100        current_chars = [self.chars[0]]
101        last_char = current_chars[0]
102        for next_char in self.chars[1:]:
103            if next_char.bbox.left - last_char.bbox.right < absolute_tolerance:
104                # Keep this char in the current cluster
105                current_chars.append(next_char)
106                if next_char.unicode not in {0x20, 0xA, 0xD}:
107                    last_char = next_char
108            else:
109                # Larger spacing detected, create a new cluster
110                _cluster(clusters, current_chars)
111                current_chars = [next_char]
112                last_char = next_char
113        _cluster(clusters, current_chars)
114
115        return clusters
116
117    def __repr__(self) -> str:
118        return f"Line({len(self.chars)})"

A line of characters with super- and sub-script characters merged into it.

CharLine( page, chars: list, bottom: float, origin: float, top: float, height: float = None, rotation: int = 0, offset: float = 0, sort_origin: float = None)
40    def __init__(
41        self,
42        page,
43        chars: list,
44        bottom: float,
45        origin: float,
46        top: float,
47        height: float = None,
48        rotation: int = 0,
49        offset: float = 0,
50        sort_origin: float = None,
51    ):
52        self._page = page
53        self.chars = chars
54        self.bottom = bottom
55        self.origin = origin
56        self.top = top
57        self.height = height or (top - bottom)
58        self.rotation = rotation
59        self.offset = offset
60        self._sort_origin = origin if sort_origin is None else sort_origin
chars
bottom
origin
top
height
rotation
offset
bbox: modm_data.utils.Rectangle
62    @cached_property
63    def bbox(self) -> Rectangle:
64        """Bounding box of the character line"""
65        return Rectangle(
66            min(c.bbox.left for c in self.chars),
67            min(c.bbox.bottom for c in self.chars),
68            max(c.bbox.right for c in self.chars),
69            max(c.bbox.top for c in self.chars),
70        )

Bounding box of the character line

fonts: set[str]
72    @cached_property
73    def fonts(self) -> set[str]:
74        """All font names in this character line"""
75        return set(c.font for c in self.chars if c.font)

All font names in this character line

def contains_font(self, *fragments: str) -> bool:
77    def contains_font(self, *fragments: str) -> bool:
78        """:return: True if any fragment is part of the font names"""
79        for fragment in fragments:
80            if any(fragment in font for font in self.fonts):
81                return True
82        return False
Returns

True if any fragment is part of the font names

content: str
84    @cached_property
85    def content(self) -> str:
86        """Text contained in the character line"""
87        return "".join(c.char for c in self.chars)

Text contained in the character line

def clusters( self, absolute_tolerance: float = None) -> list[CharCluster]:
 89    def clusters(self, absolute_tolerance: float = None) -> list[CharCluster]:
 90        """Find clusters of characters in a line separated by `absolute_tolerance`."""
 91
 92        def _cluster(clusters, chars):
 93            if chars:
 94                clusters.append(CharCluster(self, chars))
 95
 96        # We want to group the chars if the space between them is > 1em
 97        if absolute_tolerance is None:
 98            absolute_tolerance = self._page._spacing["x_em"] * 1
 99        clusters = []
100        current_chars = [self.chars[0]]
101        last_char = current_chars[0]
102        for next_char in self.chars[1:]:
103            if next_char.bbox.left - last_char.bbox.right < absolute_tolerance:
104                # Keep this char in the current cluster
105                current_chars.append(next_char)
106                if next_char.unicode not in {0x20, 0xA, 0xD}:
107                    last_char = next_char
108            else:
109                # Larger spacing detected, create a new cluster
110                _cluster(clusters, current_chars)
111                current_chars = [next_char]
112                last_char = next_char
113        _cluster(clusters, current_chars)
114
115        return clusters

Find clusters of characters in a line separated by absolute_tolerance.