modm_data.pdf2html.line
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

from functools import cached_property
from ..utils import Rectangle
from ..pdf import Character


class CharCluster:
    """
    A cluster of characters separated by less than two space-widths.
    These denote the use of additional white space that is not encoded in the
    character stream of the PDF page.
    """

    def __init__(self, line: "CharLine", chars: list[Character]):
        """
        :param line: the character line this cluster was cut from.
        :param chars: the characters belonging to this cluster.
        """
        self._line = line
        self.chars = chars

    @cached_property
    def content(self) -> str:
        """Text contained in the cluster"""
        return "".join(c.char for c in self.chars)

    @cached_property
    def bbox(self) -> Rectangle:
        """Bounding box enclosing the bounding boxes of all characters"""
        return Rectangle(
            min(c.bbox.left for c in self.chars),
            min(c.bbox.bottom for c in self.chars),
            max(c.bbox.right for c in self.chars),
            max(c.bbox.top for c in self.chars),
        )


class CharLine:
    """
    A line of characters with super- and sub-script chars merged into it.
    """

    def __init__(
        self,
        page,
        chars: list,
        bottom: float,
        origin: float,
        top: float,
        height: float = None,
        rotation: int = 0,
        offset: float = 0,
        sort_origin: float = None,
    ):
        """
        :param page: page object, stored as `_page`; `clusters()` reads its
            `_spacing["x_em"]` entry for the default tolerance.
        :param chars: the characters of this line.
        :param bottom: stored as `bottom`.
        :param origin: stored as `origin`.
        :param top: stored as `top`.
        :param height: line height; a falsy value (`None` or 0) falls back
            to `top - bottom`.
        :param rotation: stored as `rotation`.
        :param offset: stored as `offset`.
        :param sort_origin: stored as `_sort_origin`; `None` means `origin`.
        """
        self._page = page
        self.chars = chars
        self.bottom = bottom
        self.origin = origin
        self.top = top
        self.height = height or (top - bottom)
        self.rotation = rotation
        self.offset = offset
        self._sort_origin = origin if sort_origin is None else sort_origin

    @cached_property
    def bbox(self) -> Rectangle:
        """Bounding box of the character line"""
        return Rectangle(
            min(c.bbox.left for c in self.chars),
            min(c.bbox.bottom for c in self.chars),
            max(c.bbox.right for c in self.chars),
            max(c.bbox.top for c in self.chars),
        )

    @cached_property
    def fonts(self) -> set[str]:
        """All font names in this character line"""
        return set(c.font for c in self.chars if c.font)

    def contains_font(self, *fragments: str) -> bool:
        """:return: True if any fragment is part of the font names"""
        for fragment in fragments:
            if any(fragment in font for font in self.fonts):
                return True
        return False

    @cached_property
    def content(self) -> str:
        """Text contained in the character line"""
        return "".join(c.char for c in self.chars)

    def clusters(self, absolute_tolerance: float = None) -> list[CharCluster]:
        """
        Find clusters of characters in a line separated by `absolute_tolerance`.

        :param absolute_tolerance: horizontal gap at which a new cluster
            starts. Defaults to the page's `x_em` spacing.
        :return: the clusters in character order; an empty list if the line
            has no characters.
        """
        # An empty line has no clusters; indexing chars[0] below would raise.
        if not self.chars:
            return []

        # Characters stay in one cluster while the gap to the reference
        # character is below the tolerance (default: 1em).
        if absolute_tolerance is None:
            absolute_tolerance = self._page._spacing["x_em"] * 1

        clusters = []
        current_chars = [self.chars[0]]
        last_char = current_chars[0]
        for next_char in self.chars[1:]:
            if next_char.bbox.left - last_char.bbox.right < absolute_tolerance:
                # Keep this char in the current cluster
                current_chars.append(next_char)
                # Space, LF and CR do not move the reference edge, so the
                # gap keeps being measured from the previous reference char.
                if next_char.unicode not in {0x20, 0xA, 0xD}:
                    last_char = next_char
            else:
                # Gap reached the tolerance: close this cluster, start anew
                clusters.append(CharCluster(self, current_chars))
                current_chars = [next_char]
                last_char = next_char
        clusters.append(CharCluster(self, current_chars))

        return clusters

    def __repr__(self) -> str:
        return f"Line({len(self.chars)})"
class
CharCluster:
class CharCluster:
    """
    A group of characters whose mutual spacing is below two space-widths.
    Such groups reveal additional white space that the character stream of
    the PDF page does not encode.
    """

    def __init__(self, line: "CharLine", chars: list[Character]):
        """
        :param line: the character line this cluster was cut from.
        :param chars: the characters belonging to this cluster.
        """
        self.chars = chars
        self._line = line

    @cached_property
    def content(self) -> str:
        """Concatenated text of all characters in the cluster"""
        pieces = [glyph.char for glyph in self.chars]
        return "".join(pieces)

    @cached_property
    def bbox(self) -> Rectangle:
        """Smallest rectangle enclosing every character's bounding box"""
        left = min(glyph.bbox.left for glyph in self.chars)
        bottom = min(glyph.bbox.bottom for glyph in self.chars)
        right = max(glyph.bbox.right for glyph in self.chars)
        top = max(glyph.bbox.top for glyph in self.chars)
        return Rectangle(left, bottom, right, top)
A cluster of characters separated by less than two space-widths. These denote the use of additional white space that is not encoded in the character stream of the PDF page.
CharCluster( line: CharLine, chars: list[modm_data.pdf.Character])
class
CharLine:
class CharLine:
    """
    A line of characters with super- and sub-script chars merged into it.
    """

    def __init__(
        self,
        page,
        chars: list,
        bottom: float,
        origin: float,
        top: float,
        height: float = None,
        rotation: int = 0,
        offset: float = 0,
        sort_origin: float = None,
    ):
        """
        :param page: page object, stored as `_page`; `clusters()` reads its
            `_spacing["x_em"]` entry for the default tolerance.
        :param chars: the characters of this line.
        :param bottom: stored as `bottom`.
        :param origin: stored as `origin`.
        :param top: stored as `top`.
        :param height: line height; a falsy value (`None` or 0) falls back
            to `top - bottom`.
        :param rotation: stored as `rotation`.
        :param offset: stored as `offset`.
        :param sort_origin: stored as `_sort_origin`; `None` means `origin`.
        """
        self._page = page
        self.chars = chars
        self.bottom = bottom
        self.origin = origin
        self.top = top
        self.height = height or (top - bottom)
        self.rotation = rotation
        self.offset = offset
        self._sort_origin = origin if sort_origin is None else sort_origin

    @cached_property
    def bbox(self) -> Rectangle:
        """Bounding box of the character line"""
        return Rectangle(
            min(c.bbox.left for c in self.chars),
            min(c.bbox.bottom for c in self.chars),
            max(c.bbox.right for c in self.chars),
            max(c.bbox.top for c in self.chars),
        )

    @cached_property
    def fonts(self) -> set[str]:
        """All font names in this character line"""
        return set(c.font for c in self.chars if c.font)

    def contains_font(self, *fragments: str) -> bool:
        """:return: True if any fragment is part of the font names"""
        for fragment in fragments:
            if any(fragment in font for font in self.fonts):
                return True
        return False

    @cached_property
    def content(self) -> str:
        """Text contained in the character line"""
        return "".join(c.char for c in self.chars)

    def clusters(self, absolute_tolerance: float = None) -> list[CharCluster]:
        """
        Find clusters of characters in a line separated by `absolute_tolerance`.

        :param absolute_tolerance: horizontal gap at which a new cluster
            starts. Defaults to the page's `x_em` spacing.
        :return: the clusters in character order; an empty list if the line
            has no characters.
        """
        # An empty line has no clusters; indexing chars[0] below would raise.
        if not self.chars:
            return []

        # Characters stay in one cluster while the gap to the reference
        # character is below the tolerance (default: 1em).
        if absolute_tolerance is None:
            absolute_tolerance = self._page._spacing["x_em"] * 1

        clusters = []
        current_chars = [self.chars[0]]
        last_char = current_chars[0]
        for next_char in self.chars[1:]:
            if next_char.bbox.left - last_char.bbox.right < absolute_tolerance:
                # Keep this char in the current cluster
                current_chars.append(next_char)
                # Space, LF and CR do not move the reference edge, so the
                # gap keeps being measured from the previous reference char.
                if next_char.unicode not in {0x20, 0xA, 0xD}:
                    last_char = next_char
            else:
                # Gap reached the tolerance: close this cluster, start anew
                clusters.append(CharCluster(self, current_chars))
                current_chars = [next_char]
                last_char = next_char
        clusters.append(CharCluster(self, current_chars))

        return clusters

    def __repr__(self) -> str:
        return f"Line({len(self.chars)})"
A line of characters with super- and sub-script chars merged into it.
CharLine( page, chars: list, bottom: float, origin: float, top: float, height: float = None, rotation: int = 0, offset: float = 0, sort_origin: float = None)
def __init__(
    self,
    page,
    chars: list,
    bottom: float,
    origin: float,
    top: float,
    height: float = None,
    rotation: int = 0,
    offset: float = 0,
    sort_origin: float = None,
):
    """
    Store the geometry and characters of one line.

    :param page: page object, stored as `_page`.
    :param chars: the characters of this line.
    :param bottom: stored as `bottom`.
    :param origin: stored as `origin`.
    :param top: stored as `top`.
    :param height: line height; a falsy value (`None` or 0) falls back
        to `top - bottom`.
    :param rotation: stored as `rotation`.
    :param offset: stored as `offset`.
    :param sort_origin: stored as `_sort_origin`; `None` means `origin`.
    """
    # Resolve the two optional values first, then assign everything.
    if not height:
        height = top - bottom
    if sort_origin is None:
        sort_origin = origin

    self._page = page
    self.chars = chars
    self.bottom = bottom
    self.origin = origin
    self.top = top
    self.height = height
    self.rotation = rotation
    self.offset = offset
    self._sort_origin = sort_origin
@cached_property
def bbox(self) -> Rectangle:
    """Smallest rectangle enclosing the bounding boxes of all characters"""
    left = min(glyph.bbox.left for glyph in self.chars)
    bottom = min(glyph.bbox.bottom for glyph in self.chars)
    right = max(glyph.bbox.right for glyph in self.chars)
    top = max(glyph.bbox.top for glyph in self.chars)
    return Rectangle(left, bottom, right, top)
Bounding box of the character line
fonts: set[str]
@cached_property
def fonts(self) -> set[str]:
    """Distinct non-empty font names used by the characters of this line"""
    return {glyph.font for glyph in self.chars if glyph.font}
All font names in this character line
def
contains_font(self, *fragments: str) -> bool:
def contains_font(self, *fragments: str) -> bool:
    """:return: True if at least one fragment is a substring of a font name"""
    return any(part in name for part in fragments for name in self.fonts)
Returns
True if any fragment is part of the font names
content: str
@cached_property
def content(self) -> str:
    """Concatenated text of all characters in the line"""
    pieces = [glyph.char for glyph in self.chars]
    return "".join(pieces)
Text contained in the character line
def clusters(self, absolute_tolerance: float = None) -> "list[CharCluster]":
    """
    Find clusters of characters in a line separated by `absolute_tolerance`.

    :param absolute_tolerance: horizontal gap at which a new cluster
        starts. Defaults to the page's `x_em` spacing.
    :return: the clusters in character order; an empty list if the line
        has no characters.
    """
    # An empty line has no clusters; indexing chars[0] below would raise.
    if not self.chars:
        return []

    # Characters stay in one cluster while the gap to the reference
    # character is below the tolerance (default: 1em).
    if absolute_tolerance is None:
        absolute_tolerance = self._page._spacing["x_em"] * 1

    clusters = []
    current_chars = [self.chars[0]]
    last_char = current_chars[0]
    for next_char in self.chars[1:]:
        if next_char.bbox.left - last_char.bbox.right < absolute_tolerance:
            # Keep this char in the current cluster
            current_chars.append(next_char)
            # Space, LF and CR do not move the reference edge, so the
            # gap keeps being measured from the previous reference char.
            if next_char.unicode not in {0x20, 0xA, 0xD}:
                last_char = next_char
        else:
            # Gap reached the tolerance: close this cluster, start anew
            clusters.append(CharCluster(self, current_chars))
            current_chars = [next_char]
            last_char = next_char
    clusters.append(CharCluster(self, current_chars))

    return clusters
Find clusters of characters in a line that are separated by at least absolute_tolerance.