modm_data.pdf2html.line
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

from functools import cached_property
from ..utils import Rectangle


class CharCluster:
    """
    A cluster of characters separated by less than two space-widths.
    These denote the use of additional white space that is not encoded in the
    character stream of the PDF page.
    """

    def __init__(self, line, chars: list):
        """
        :param line: the line object this cluster was created from.
        :param chars: the characters that make up the cluster.
        """
        self._line = line
        self.chars = chars

    @cached_property
    def content(self) -> str:
        """The ``char`` values of all characters joined in order."""
        return "".join(c.char for c in self.chars)

    @cached_property
    def bbox(self) -> Rectangle:
        """
        The smallest rectangle enclosing the bounding boxes of all characters.
        Raises ``ValueError`` if the cluster has no characters.
        """
        return Rectangle(min(c.bbox.left for c in self.chars),
                         min(c.bbox.bottom for c in self.chars),
                         max(c.bbox.right for c in self.chars),
                         max(c.bbox.top for c in self.chars))


class CharLine:
    """
    A line of characters with super- and sub-script chars merged into it.
    """

    def __init__(self, page, chars: list, bottom: float,
                 origin: float, top: float,
                 height: float = None, rotation: int = 0,
                 offset: float = 0, sort_origin: float = None):
        """
        :param page: the page object this line belongs to.
        :param chars: the characters of the line.
        :param bottom: stored as ``self.bottom``.
        :param origin: stored as ``self.origin``.
        :param top: stored as ``self.top``.
        :param height: line height; ``top - bottom`` is used when it is
                       ``None`` or otherwise falsy.
        :param rotation: stored as ``self.rotation``.
        :param offset: stored as ``self.offset``.
        :param sort_origin: stored as ``self._sort_origin``; falls back to
                            ``origin`` when ``None``.
        """
        self._page = page
        self.chars = chars
        self.bottom = bottom
        self.origin = origin
        self.top = top
        self.height = height or (top - bottom)
        self.rotation = rotation
        self.offset = offset
        self._sort_origin = origin if sort_origin is None else sort_origin

    @cached_property
    def bbox(self) -> Rectangle:
        """
        The smallest rectangle enclosing the bounding boxes of all characters.
        Raises ``ValueError`` if the line has no characters.
        """
        return Rectangle(min(c.bbox.left for c in self.chars),
                         min(c.bbox.bottom for c in self.chars),
                         max(c.bbox.right for c in self.chars),
                         max(c.bbox.top for c in self.chars))

    @cached_property
    def fonts(self) -> set:
        """The set of truthy ``font`` values used by the characters."""
        return {c.font for c in self.chars if c.font}

    def contains_font(self, *fragments) -> bool:
        """Return True if any fragment is contained in any font of the line."""
        return any(fragment in font
                   for fragment in fragments
                   for font in self.fonts)

    @cached_property
    def content(self) -> str:
        """The ``char`` values of all characters joined in order."""
        return "".join(c.char for c in self.chars)

    def clusters(self, atol: float = None) -> list[CharCluster]:
        """
        Split the line into clusters of characters incl. whitespace chars.

        A character joins the current cluster while the gap between its left
        edge and the right edge of the reference character is below ``atol``;
        otherwise it starts a new cluster. Space, line feed and carriage
        return characters that join a cluster do not become the reference
        character, so the gap is measured across them.

        :param atol: gap threshold; defaults to the page's ``x_em`` spacing.
        :return: the clusters in character order, or an empty list if the
                 line has no characters.
        """
        # A line without characters has no clusters. Without this guard the
        # `self.chars[0]` access below raises an IndexError.
        if not self.chars:
            return []

        # Chars stay grouped while the space between them is < 1em
        if atol is None:
            atol = self._page._spacing["x_em"] * 1
        clusters = []
        current_chars = [self.chars[0]]
        last_char = current_chars[0]
        for next_char in self.chars[1:]:
            if next_char.bbox.left - last_char.bbox.right < atol:
                # Keep this char in the current cluster
                current_chars.append(next_char)
                if next_char.unicode not in {0x20, 0xa, 0xd}:
                    last_char = next_char
            else:
                # Larger spacing detected, create a new cluster
                clusters.append(CharCluster(self, current_chars))
                current_chars = [next_char]
                last_char = next_char
        # current_chars always holds at least one char at this point
        clusters.append(CharCluster(self, current_chars))

        return clusters

    def __repr__(self) -> str:
        return f"Line({len(self.chars)})"
class
CharCluster:
class CharCluster:
    """
    A cluster of characters separated by less than two space-widths.
    These denote the use of additional white space that is not encoded in the
    character stream of the PDF page.
    """

    def __init__(self, line, chars: list):
        """Remember the originating line and the characters of the cluster."""
        self.chars = chars
        self._line = line

    @cached_property
    def content(self) -> str:
        """The text of the cluster: every character's ``char`` in order."""
        pieces = [char.char for char in self.chars]
        return "".join(pieces)

    @cached_property
    def bbox(self) -> Rectangle:
        """The rectangle spanning the bounding boxes of all characters."""
        boxes = [char.bbox for char in self.chars]
        left = min(box.left for box in boxes)
        bottom = min(box.bottom for box in boxes)
        right = max(box.right for box in boxes)
        top = max(box.top for box in boxes)
        return Rectangle(left, bottom, right, top)
A cluster of characters separated by less than two space-widths. These denote the use of additional white space that is not encoded in the character stream of the PDF page.
class
CharLine:
class CharLine:
    """
    A line of characters with super- and sub-script chars merged into it.
    """

    def __init__(self, page, chars: list, bottom: float,
                 origin: float, top: float,
                 height: float = None, rotation: int = 0,
                 offset: float = 0, sort_origin: float = None):
        """
        :param page: the page object this line belongs to.
        :param chars: the characters of the line.
        :param bottom: stored as ``self.bottom``.
        :param origin: stored as ``self.origin``.
        :param top: stored as ``self.top``.
        :param height: line height; ``top - bottom`` is used when it is
                       ``None`` or otherwise falsy.
        :param rotation: stored as ``self.rotation``.
        :param offset: stored as ``self.offset``.
        :param sort_origin: stored as ``self._sort_origin``; falls back to
                            ``origin`` when ``None``.
        """
        self._page = page
        self.chars = chars
        self.bottom = bottom
        self.origin = origin
        self.top = top
        self.height = height or (top - bottom)
        self.rotation = rotation
        self.offset = offset
        self._sort_origin = origin if sort_origin is None else sort_origin

    @cached_property
    def bbox(self) -> Rectangle:
        """
        The smallest rectangle enclosing the bounding boxes of all characters.
        Raises ``ValueError`` if the line has no characters.
        """
        return Rectangle(min(c.bbox.left for c in self.chars),
                         min(c.bbox.bottom for c in self.chars),
                         max(c.bbox.right for c in self.chars),
                         max(c.bbox.top for c in self.chars))

    @cached_property
    def fonts(self) -> set:
        """The set of truthy ``font`` values used by the characters."""
        return {c.font for c in self.chars if c.font}

    def contains_font(self, *fragments) -> bool:
        """Return True if any fragment is contained in any font of the line."""
        return any(fragment in font
                   for fragment in fragments
                   for font in self.fonts)

    @cached_property
    def content(self) -> str:
        """The ``char`` values of all characters joined in order."""
        return "".join(c.char for c in self.chars)

    def clusters(self, atol: float = None) -> list[CharCluster]:
        """
        Split the line into clusters of characters incl. whitespace chars.

        A character joins the current cluster while the gap between its left
        edge and the right edge of the reference character is below ``atol``;
        otherwise it starts a new cluster. Space, line feed and carriage
        return characters that join a cluster do not become the reference
        character, so the gap is measured across them.

        :param atol: gap threshold; defaults to the page's ``x_em`` spacing.
        :return: the clusters in character order, or an empty list if the
                 line has no characters.
        """
        # A line without characters has no clusters. Without this guard the
        # `self.chars[0]` access below raises an IndexError.
        if not self.chars:
            return []

        # Chars stay grouped while the space between them is < 1em
        if atol is None:
            atol = self._page._spacing["x_em"] * 1
        clusters = []
        current_chars = [self.chars[0]]
        last_char = current_chars[0]
        for next_char in self.chars[1:]:
            if next_char.bbox.left - last_char.bbox.right < atol:
                # Keep this char in the current cluster
                current_chars.append(next_char)
                if next_char.unicode not in {0x20, 0xa, 0xd}:
                    last_char = next_char
            else:
                # Larger spacing detected, create a new cluster
                clusters.append(CharCluster(self, current_chars))
                current_chars = [next_char]
                last_char = next_char
        # current_chars always holds at least one char at this point
        clusters.append(CharCluster(self, current_chars))

        return clusters

    def __repr__(self) -> str:
        return f"Line({len(self.chars)})"
A line of characters with super- and sub-script characters merged into it.
CharLine( page, chars: list, bottom: float, origin: float, top: float, height: float = None, rotation: int = 0, offset: float = 0, sort_origin: float = None)
def __init__(self, page, chars: list, bottom: float,
             origin: float, top: float,
             height: float = None, rotation: int = 0,
             offset: float = 0, sort_origin: float = None):
    """
    Store the page, the characters and the geometry of the line.

    ``height`` falls back to ``top - bottom`` when it is ``None`` or
    otherwise falsy; ``sort_origin`` falls back to ``origin`` when ``None``.
    """
    self._page = page
    self.chars = chars

    # Vertical geometry of the line
    self.bottom, self.origin, self.top = bottom, origin, top
    if height:
        self.height = height
    else:
        self.height = top - bottom

    self.rotation = rotation
    self.offset = offset

    if sort_origin is not None:
        self._sort_origin = sort_origin
    else:
        self._sort_origin = origin
def clusters(self, atol: float = None) -> list[CharCluster]:
    """
    Split the line into clusters of characters incl. whitespace chars.

    A character joins the current cluster while the gap between its left
    edge and the right edge of the reference character is below ``atol``;
    otherwise it starts a new cluster. Space, line feed and carriage
    return characters that join a cluster do not become the reference
    character, so the gap is measured across them.

    :param atol: gap threshold; defaults to the page's ``x_em`` spacing.
    :return: the clusters in character order, or an empty list if the
             line has no characters.
    """
    # A line without characters has no clusters. Without this guard the
    # `self.chars[0]` access below raises an IndexError.
    if not self.chars:
        return []

    # Chars stay grouped while the space between them is < 1em
    if atol is None:
        atol = self._page._spacing["x_em"] * 1
    clusters = []
    current_chars = [self.chars[0]]
    last_char = current_chars[0]
    for next_char in self.chars[1:]:
        if next_char.bbox.left - last_char.bbox.right < atol:
            # Keep this char in the current cluster
            current_chars.append(next_char)
            if next_char.unicode not in {0x20, 0xa, 0xd}:
                last_char = next_char
        else:
            # Larger spacing detected, create a new cluster
            clusters.append(CharCluster(self, current_chars))
            current_chars = [next_char]
            last_char = next_char
    # current_chars always holds at least one char at this point
    clusters.append(CharCluster(self, current_chars))

    return clusters