modm_data.pdf.page
PDF Pages
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# PDF Pages


"""

import ctypes
import logging
import weakref
from typing import Iterator, Callable
from bisect import bisect_left, bisect_right
from functools import cached_property, cache
from collections import defaultdict, OrderedDict
import pypdfium2 as pp

from ..utils import Rectangle, Region
from .character import Character
from .link import ObjLink, WebLink
from .graphics import Path, Image
from .structure import Structure

LOGGER = logging.getLogger(__name__)


class Page(pp.PdfPage):
    """
    This class provides low-level access to graphics and characters of the page.
    It also fixes missing bounding boxes for rotated characters on page load,
    and allows searching for characters in an area instead of just text.
    """
    def __init__(self, document: "modm_data.pdf.Document", index: int):
        """
        :param document: a PDF document.
        :param index: 0-index page number.
        """
        self.index = index
        """0-index page number."""
        self.number = index + 1
        """1-index page number."""

        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
        self._links = None
        self._weblinks = None
        self._linked = False
        # Per-page character cache used by `char()`. A `functools.cache` on the
        # method would key on `self` in a global cache and keep every page
        # (and its native handles) alive until interpreter exit.
        self._char_cache: dict[int, Character] = {}

        LOGGER.debug(f"Loading: {index}")

        self._text = self.get_textpage()
        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
        # close them in reverse order
        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)

        self._fix_bboxes()

    @cached_property
    def label(self) -> str:
        """The page label."""
        return self.pdf.get_page_label(self.index)

    @cached_property
    def width(self) -> float:
        """The page width."""
        return self.get_width()

    @cached_property
    def height(self) -> float:
        """The page height."""
        return self.get_height()

    @cached_property
    def rotation(self) -> int:
        """The page rotation in degrees."""
        return self.get_rotation()

    @cached_property
    def bbox(self) -> Rectangle:
        """The page bounding box."""
        return Rectangle(*self.get_bbox())

    @cached_property
    def char_count(self) -> int:
        """The total count of characters."""
        return self._text.count_chars()

    def char(self, index: int) -> Character:
        """
        :param index: 0-index of the character on this page.
        :return: The character at the 0-index. Repeated calls with the same
                 index return the same object.
        """
        try:
            return self._char_cache[index]
        except KeyError:
            char = self._char_cache[index] = Character(self, index)
            return char

    @property
    def chars(self) -> Iterator[Character]:
        """Yields all characters."""
        for ii in range(self.char_count):
            yield self.char(ii)

    @cached_property
    def objlinks(self) -> list[ObjLink]:
        """All object links."""
        links = []
        pos = ctypes.c_int(0)
        link = pp.raw.FPDF_LINK()
        while pp.raw.FPDFLink_Enumerate(self, pos, link):
            links.append(ObjLink(self, link))
        return links

    @cached_property
    def weblinks(self) -> list[WebLink]:
        """All web links."""
        count = pp.raw.FPDFLink_CountWebLinks(self._linkpage)
        return [WebLink(self, ii) for ii in range(count)]

    def chars_in_area(self, area: Rectangle) -> list[Character]:
        """
        :param area: area to search for character in.
        :return: All characters found in the area.
        """
        def _mid_x(char):
            return char.bbox.midpoint.x

        found = []
        # We perform binary searches of the lower and upper y-positions first
        # lines are ordered by y-position
        ypositions = list(self._charlines)
        y_bottom = bisect_left(ypositions, area.bottom)
        y_top = bisect_right(ypositions, area.top, lo=y_bottom)

        # Then for every line we do another binary search for left and right
        for ypos in ypositions[y_bottom:y_top]:
            chars = self._charlines[ypos]
            x_left = bisect_left(chars, area.left, key=_mid_x)
            x_right = bisect_right(chars, area.right, lo=x_left, key=_mid_x)
            # Finally we add all these characters
            found.extend(chars[x_left:x_right])
        return found

    def text_in_area(self, area: Rectangle) -> str:
        """
        :param area: area to search for text in.
        :return: Only the text found in the area.
        """
        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)

    @property
    def structures(self) -> Iterator[Structure]:
        """The PDF/UA tags."""
        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
        for ii in range(count):
            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
            yield Structure(self, child)

    def find(self, string: str, case_sensitive: bool = True) -> Iterator[list[Character]]:
        """
        Searches for a match string as whole, consecutive words and yields the
        characters of each match.

        :param string: The search string.
        :param case_sensitive: Ignore case if false.
        :return: yields one list of characters per match.
        """
        searcher = self._text.search(string, match_case=case_sensitive,
                                     match_whole_word=True, consecutive=True)
        while match := searcher.get_next():
            start, count = match
            yield [self.char(ii) for ii in range(start, start + count)]

    @cached_property
    def paths(self) -> list[Path]:
        """All paths."""
        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]

    @cached_property
    def images(self) -> list[Image]:
        """All images."""
        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]

    def graphic_clusters(self, predicate: Callable[[Path|Image], bool] | None = None,
                         absolute_tolerance: float | None = None) -> \
            list[tuple[Rectangle, list[Path|Image]]]:
        """
        Groups the paths and images of the page into clusters: first into
        vertical regions, then into horizontal subregions inside each of them.

        :param predicate: Only graphics for which this returns true are
                          clustered. All graphics are used if `None`.
        :param absolute_tolerance: Tolerance passed to the region overlap and
                                   containment checks. Defaults to 1% of the
                                   shorter page side.
        :return: `(bounding box, graphics)` tuples sorted by descending
                 `bbox.y`, then ascending `bbox.x`.
        """
        if absolute_tolerance is None:
            absolute_tolerance = min(self.width, self.height) * 0.01

        # Paths first, then images, each filtered by the predicate
        graphics = [obj for obj in (*self.paths, *self.images)
                    if predicate is None or predicate(obj)]

        # First collect all vertical regions
        regions = []
        for obj in sorted(graphics, key=lambda o: o.bbox.y):
            for reg in regions:
                if reg.overlaps(obj.bbox.bottom, obj.bbox.top, absolute_tolerance):
                    # They overlap, so merge them
                    reg.v0 = min(reg.v0, obj.bbox.bottom)
                    reg.v1 = max(reg.v1, obj.bbox.top)
                    reg.objs.append(obj)
                    break
            else:
                regions.append(Region(obj.bbox.bottom, obj.bbox.top, obj))

        # Now collect horizontal regions inside each vertical region.
        # The x-ordering does not depend on the region, so sort only once.
        graphics_by_x = sorted(graphics, key=lambda o: o.bbox.x)
        for yreg in regions:
            for obj in graphics_by_x:
                # check if the graphic is contained in the vertical region
                if yreg.contains(obj.bbox.y, absolute_tolerance):
                    for xreg in yreg.subregions:
                        if xreg.overlaps(obj.bbox.left, obj.bbox.right, absolute_tolerance):
                            # They overlap so merge them
                            xreg.v0 = min(xreg.v0, obj.bbox.left)
                            xreg.v1 = max(xreg.v1, obj.bbox.right)
                            xreg.objs.append(obj)
                            break
                    else:
                        yreg.subregions.append(Region(obj.bbox.left, obj.bbox.right, obj))

        clusters = []
        for yreg in regions:
            for xreg in yreg.subregions:
                if len(yreg.subregions) > 1:
                    # Strip down the height again for subregions. Use the real
                    # extrema instead of fixed sentinels so that non-positive
                    # coordinates are handled correctly.
                    y0 = min((o.bbox.bottom for o in xreg.objs), default=yreg.v0)
                    y1 = max((o.bbox.top for o in xreg.objs), default=yreg.v1)
                else:
                    y0, y1 = yreg.v0, yreg.v1
                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
                clusters.append((bbox, xreg.objs))

        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))

    def _link_characters(self):
        """Attaches object links and web links to the characters they cover (once)."""
        if self._linked:
            return
        # The in-document links only gives us rectangles and we must find the
        # linked chars ourselves
        for link in self.objlinks:
            for char in self.chars_in_area(link.bbox):
                char.objlink = link
        # The weblinks give you an explicit char range, very convenient
        for link in self.weblinks:
            for ii in range(*link.range):
                self.char(ii).weblink = link
        self._linked = True

    @cached_property
    def _charlines(self):
        """
        Characters grouped by their midpoint y-position rounded to one decimal.
        Keys are in ascending order, each line is sorted by midpoint x-position.
        """
        charlines = defaultdict(list)
        for char in self.chars:
            charlines[round(char.bbox.midpoint.y, 1)].append(char)

        orderedchars = OrderedDict.fromkeys(sorted(charlines))
        for ypos, chars in charlines.items():
            orderedchars[ypos] = sorted(chars, key=lambda c: c.bbox.midpoint.x)

        return orderedchars

    def _fix_bboxes(self):
        """
        Replaces empty character bounding boxes with a cached bounding box of
        a character with the same font, unicode value and rounded size.
        """
        def _key(char):
            height = round(char.tbbox.height, 1)
            width = round(char.tbbox.width, 1)
            return f"{char.font} {char.unicode} {height} {width}"

        fix_chars = []
        for char in self.chars:
            if not char._bbox.width or not char._bbox.height:
                # Empty bbox: needs fixing, except for unrotated line breaks
                if char._rotation or char.unicode not in {0xa, 0xd}:
                    fix_chars.append(char)
            elif (char.unicode not in {0xa, 0xd} and not char._rotation and
                  _key(char) not in self.pdf._bbox_cache):
                # Valid bbox of an unrotated character: remember it relative
                # to the character origin in the document-wide cache
                bbox = char._bbox.translated(-char.origin).rotated(self.rotation + char._rotation)
                self.pdf._bbox_cache[_key(char)] = (char, bbox)

        for char in fix_chars:
            cached = self.pdf._bbox_cache.get(_key(char))
            if cached is not None:
                _, bbox = cached
                bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
                char._bbox = bbox
            elif char.unicode not in {0x20, 0xa, 0xd}:
                LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")
LOGGER =
<Logger modm_data.pdf.page (WARNING)>
class
Page(pypdfium2._helpers.page.PdfPage):
class Page(pp.PdfPage):
    """
    This class provides low-level access to graphics and characters of the page.
    It also fixes missing bounding boxes for rotated characters on page load,
    and allows searching for characters in an area instead of just text.
    """
    def __init__(self, document: "modm_data.pdf.Document", index: int):
        """
        :param document: a PDF document.
        :param index: 0-index page number.
        """
        self.index = index
        """0-index page number."""
        self.number = index + 1
        """1-index page number."""

        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
        self._links = None
        self._weblinks = None
        self._linked = False
        # Per-page character cache used by `char()`. A `functools.cache` on the
        # method would key on `self` in a global cache and keep every page
        # (and its native handles) alive until interpreter exit.
        self._char_cache: dict[int, Character] = {}

        LOGGER.debug(f"Loading: {index}")

        self._text = self.get_textpage()
        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
        # close them in reverse order
        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)

        self._fix_bboxes()

    @cached_property
    def label(self) -> str:
        """The page label."""
        return self.pdf.get_page_label(self.index)

    @cached_property
    def width(self) -> float:
        """The page width."""
        return self.get_width()

    @cached_property
    def height(self) -> float:
        """The page height."""
        return self.get_height()

    @cached_property
    def rotation(self) -> int:
        """The page rotation in degrees."""
        return self.get_rotation()

    @cached_property
    def bbox(self) -> Rectangle:
        """The page bounding box."""
        return Rectangle(*self.get_bbox())

    @cached_property
    def char_count(self) -> int:
        """The total count of characters."""
        return self._text.count_chars()

    def char(self, index: int) -> Character:
        """
        :param index: 0-index of the character on this page.
        :return: The character at the 0-index. Repeated calls with the same
                 index return the same object.
        """
        try:
            return self._char_cache[index]
        except KeyError:
            char = self._char_cache[index] = Character(self, index)
            return char

    @property
    def chars(self) -> Iterator[Character]:
        """Yields all characters."""
        for ii in range(self.char_count):
            yield self.char(ii)

    @cached_property
    def objlinks(self) -> list[ObjLink]:
        """All object links."""
        links = []
        pos = ctypes.c_int(0)
        link = pp.raw.FPDF_LINK()
        while pp.raw.FPDFLink_Enumerate(self, pos, link):
            links.append(ObjLink(self, link))
        return links

    @cached_property
    def weblinks(self) -> list[WebLink]:
        """All web links."""
        count = pp.raw.FPDFLink_CountWebLinks(self._linkpage)
        return [WebLink(self, ii) for ii in range(count)]

    def chars_in_area(self, area: Rectangle) -> list[Character]:
        """
        :param area: area to search for character in.
        :return: All characters found in the area.
        """
        def _mid_x(char):
            return char.bbox.midpoint.x

        found = []
        # We perform binary searches of the lower and upper y-positions first
        # lines are ordered by y-position
        ypositions = list(self._charlines)
        y_bottom = bisect_left(ypositions, area.bottom)
        y_top = bisect_right(ypositions, area.top, lo=y_bottom)

        # Then for every line we do another binary search for left and right
        for ypos in ypositions[y_bottom:y_top]:
            chars = self._charlines[ypos]
            x_left = bisect_left(chars, area.left, key=_mid_x)
            x_right = bisect_right(chars, area.right, lo=x_left, key=_mid_x)
            # Finally we add all these characters
            found.extend(chars[x_left:x_right])
        return found

    def text_in_area(self, area: Rectangle) -> str:
        """
        :param area: area to search for text in.
        :return: Only the text found in the area.
        """
        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)

    @property
    def structures(self) -> Iterator[Structure]:
        """The PDF/UA tags."""
        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
        for ii in range(count):
            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
            yield Structure(self, child)

    def find(self, string: str, case_sensitive: bool = True) -> Iterator[list[Character]]:
        """
        Searches for a match string as whole, consecutive words and yields the
        characters of each match.

        :param string: The search string.
        :param case_sensitive: Ignore case if false.
        :return: yields one list of characters per match.
        """
        searcher = self._text.search(string, match_case=case_sensitive,
                                     match_whole_word=True, consecutive=True)
        while match := searcher.get_next():
            start, count = match
            yield [self.char(ii) for ii in range(start, start + count)]

    @cached_property
    def paths(self) -> list[Path]:
        """All paths."""
        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]

    @cached_property
    def images(self) -> list[Image]:
        """All images."""
        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]

    def graphic_clusters(self, predicate: Callable[[Path|Image], bool] | None = None,
                         absolute_tolerance: float | None = None) -> \
            list[tuple[Rectangle, list[Path|Image]]]:
        """
        Groups the paths and images of the page into clusters: first into
        vertical regions, then into horizontal subregions inside each of them.

        :param predicate: Only graphics for which this returns true are
                          clustered. All graphics are used if `None`.
        :param absolute_tolerance: Tolerance passed to the region overlap and
                                   containment checks. Defaults to 1% of the
                                   shorter page side.
        :return: `(bounding box, graphics)` tuples sorted by descending
                 `bbox.y`, then ascending `bbox.x`.
        """
        if absolute_tolerance is None:
            absolute_tolerance = min(self.width, self.height) * 0.01

        # Paths first, then images, each filtered by the predicate
        graphics = [obj for obj in (*self.paths, *self.images)
                    if predicate is None or predicate(obj)]

        # First collect all vertical regions
        regions = []
        for obj in sorted(graphics, key=lambda o: o.bbox.y):
            for reg in regions:
                if reg.overlaps(obj.bbox.bottom, obj.bbox.top, absolute_tolerance):
                    # They overlap, so merge them
                    reg.v0 = min(reg.v0, obj.bbox.bottom)
                    reg.v1 = max(reg.v1, obj.bbox.top)
                    reg.objs.append(obj)
                    break
            else:
                regions.append(Region(obj.bbox.bottom, obj.bbox.top, obj))

        # Now collect horizontal regions inside each vertical region.
        # The x-ordering does not depend on the region, so sort only once.
        graphics_by_x = sorted(graphics, key=lambda o: o.bbox.x)
        for yreg in regions:
            for obj in graphics_by_x:
                # check if the graphic is contained in the vertical region
                if yreg.contains(obj.bbox.y, absolute_tolerance):
                    for xreg in yreg.subregions:
                        if xreg.overlaps(obj.bbox.left, obj.bbox.right, absolute_tolerance):
                            # They overlap so merge them
                            xreg.v0 = min(xreg.v0, obj.bbox.left)
                            xreg.v1 = max(xreg.v1, obj.bbox.right)
                            xreg.objs.append(obj)
                            break
                    else:
                        yreg.subregions.append(Region(obj.bbox.left, obj.bbox.right, obj))

        clusters = []
        for yreg in regions:
            for xreg in yreg.subregions:
                if len(yreg.subregions) > 1:
                    # Strip down the height again for subregions. Use the real
                    # extrema instead of fixed sentinels so that non-positive
                    # coordinates are handled correctly.
                    y0 = min((o.bbox.bottom for o in xreg.objs), default=yreg.v0)
                    y1 = max((o.bbox.top for o in xreg.objs), default=yreg.v1)
                else:
                    y0, y1 = yreg.v0, yreg.v1
                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
                clusters.append((bbox, xreg.objs))

        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))

    def _link_characters(self):
        """Attaches object links and web links to the characters they cover (once)."""
        if self._linked:
            return
        # The in-document links only gives us rectangles and we must find the
        # linked chars ourselves
        for link in self.objlinks:
            for char in self.chars_in_area(link.bbox):
                char.objlink = link
        # The weblinks give you an explicit char range, very convenient
        for link in self.weblinks:
            for ii in range(*link.range):
                self.char(ii).weblink = link
        self._linked = True

    @cached_property
    def _charlines(self):
        """
        Characters grouped by their midpoint y-position rounded to one decimal.
        Keys are in ascending order, each line is sorted by midpoint x-position.
        """
        charlines = defaultdict(list)
        for char in self.chars:
            charlines[round(char.bbox.midpoint.y, 1)].append(char)

        orderedchars = OrderedDict.fromkeys(sorted(charlines))
        for ypos, chars in charlines.items():
            orderedchars[ypos] = sorted(chars, key=lambda c: c.bbox.midpoint.x)

        return orderedchars

    def _fix_bboxes(self):
        """
        Replaces empty character bounding boxes with a cached bounding box of
        a character with the same font, unicode value and rounded size.
        """
        def _key(char):
            height = round(char.tbbox.height, 1)
            width = round(char.tbbox.width, 1)
            return f"{char.font} {char.unicode} {height} {width}"

        fix_chars = []
        for char in self.chars:
            if not char._bbox.width or not char._bbox.height:
                # Empty bbox: needs fixing, except for unrotated line breaks
                if char._rotation or char.unicode not in {0xa, 0xd}:
                    fix_chars.append(char)
            elif (char.unicode not in {0xa, 0xd} and not char._rotation and
                  _key(char) not in self.pdf._bbox_cache):
                # Valid bbox of an unrotated character: remember it relative
                # to the character origin in the document-wide cache
                bbox = char._bbox.translated(-char.origin).rotated(self.rotation + char._rotation)
                self.pdf._bbox_cache[_key(char)] = (char, bbox)

        for char in fix_chars:
            cached = self.pdf._bbox_cache.get(_key(char))
            if cached is not None:
                _, bbox = cached
                bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
                char._bbox = bbox
            elif char.unicode not in {0x20, 0xa, 0xd}:
                LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")
This class provides low-level access to graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.
Page(document: modm_data.pdf.document.Document, index: int)
def __init__(self, document: "modm_data.pdf.Document", index: int):
    """
    Loads the page at `index` from `document`, opens its text page, web link
    page and structure tree, and finally repairs character bounding boxes.

    :param document: a PDF document.
    :param index: 0-index page number.
    """
    self.index = index
    """0-index page number."""
    self.number = index + 1
    """1-index page number."""

    # The raw page handle is loaded here and handed to the pypdfium2 base class
    super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
    # NOTE(review): `_links` and `_weblinks` are not read anywhere in the
    # visible source -- confirm whether they are still needed.
    self._links = None
    self._weblinks = None
    # Flipped to True by `_link_characters()` once links have been attached
    self._linked = False

    LOGGER.debug(f"Loading: {index}")

    self._text = self.get_textpage()
    # The web link page is derived from the text page loaded above
    self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
    self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
    # close them in reverse order
    # The finalizers run when this page is garbage collected or at exit
    weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
    weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)

    # Needs the text page, so this must come last
    self._fix_bboxes()
Parameters
- document: a PDF document.
- index: 0-index page number.
label: str
@cached_property
def label(self) -> str:
    """The label of this page, looked up in the owning document by page index."""
    document = self.pdf
    return document.get_page_label(self.index)
The page label.
width: float
@cached_property
def width(self) -> float:
    """The width of the page as reported by `get_width()`."""
    page_width = self.get_width()
    return page_width
The page width.
height: float
@cached_property
def height(self) -> float:
    """The height of the page as reported by `get_height()`."""
    page_height = self.get_height()
    return page_height
The page height.
rotation: int
@cached_property
def rotation(self) -> int:
    """The rotation of the page in degrees as reported by `get_rotation()`."""
    degrees = self.get_rotation()
    return degrees
The page rotation in degrees.
@cached_property
def bbox(self) -> Rectangle:
    """The bounding box of the page, built from the values of `get_bbox()`."""
    coordinates = self.get_bbox()
    return Rectangle(*coordinates)
The page bounding box.
char_count: int
@cached_property
def char_count(self) -> int:
    """The number of characters on the text page of this page."""
    textpage = self._text
    return textpage.count_chars()
The total count of characters.
def char(self, index: int) -> Character:
    """
    :param index: 0-index of the character on this page.
    :return: The character at the 0-index. Repeated calls with the same
             index return the same object.
    """
    # The cache lives on the instance instead of in a `functools.cache` on the
    # method: the latter keys on `self` in a global cache and keeps every page
    # alive, so its finalizers could never run before interpreter exit.
    try:
        known = self._char_cache
    except AttributeError:
        # Created lazily so that no other method has to set it up
        known = self._char_cache = {}
    try:
        return known[index]
    except KeyError:
        char = known[index] = Character(self, index)
        return char
Returns
The character at the 0-index.
chars: Iterator[modm_data.pdf.character.Character]
@property
def chars(self) -> Iterator[Character]:
    """Lazily yields every character of the page in index order."""
    yield from map(self.char, range(self.char_count))
Yields all characters.
objlinks: list[modm_data.pdf.link.ObjLink]
@cached_property
def objlinks(self) -> list[ObjLink]:
    """All object links of the page, in the order they are enumerated."""
    found = []
    # `cursor` and `handle` are the arguments handed to the enumeration on
    # every call; the very same `handle` object is passed to each `ObjLink`.
    # NOTE(review): presumably `ObjLink` reads what it needs from the handle
    # on construction -- verify against its implementation.
    cursor = ctypes.c_int(0)
    handle = pp.raw.FPDF_LINK()
    while True:
        if not pp.raw.FPDFLink_Enumerate(self, cursor, handle):
            break
        found.append(ObjLink(self, handle))
    return found
All object links.
weblinks: list[modm_data.pdf.link.WebLink]
@cached_property
def weblinks(self) -> list[WebLink]:
    """All web links of the page, one per index of the web link page."""
    total = pp.raw.FPDFLink_CountWebLinks(self._linkpage)
    return [WebLink(self, position) for position in range(total)]
All web links.
def
chars_in_area( self, area: modm_data.utils.math.Rectangle) -> list[modm_data.pdf.character.Character]:
def chars_in_area(self, area: Rectangle) -> list[Character]:
    """
    Collects the characters whose bounding box midpoint lies inside the area.

    :param area: area to search for character in.
    :return: All characters found in the area.
    """
    def mid_x(char):
        return char.bbox.midpoint.x

    lines = self._charlines
    # Lines are keyed by ascending y-position, so the matching lines form a
    # contiguous slice that two binary searches can locate.
    heights = list(lines.keys())
    first = bisect_left(heights, area.bottom)
    last = bisect_right(heights, area.top, lo=first)

    result = []
    for height in heights[first:last]:
        line = lines[height]
        # Each line is ordered by x midpoint: binary search again
        start = bisect_left(line, area.left, key=mid_x)
        stop = bisect_right(line, area.right, lo=start, key=mid_x)
        result.extend(line[start:stop])
    return result
Parameters
- area: area to search for character in.
Returns
All characters found in the area.
def text_in_area(self, area: Rectangle) -> str:
    """
    Extracts the text inside the area from the text page.

    :param area: area to search for text in.
    :return: Only the text found in the area.
    """
    bounds = (area.left, area.bottom, area.right, area.top)
    return self._text.get_text_bounded(*bounds)
Parameters
- area: area to search for text in.
Returns
Only the text found in the area.
structures: Iterator[modm_data.pdf.structure.Structure]
@property
def structures(self) -> Iterator[Structure]:
    """Yields the PDF/UA tags: one structure per child of the structure tree."""
    tree = self._structtree
    for position in range(pp.raw.FPDF_StructTree_CountChildren(tree)):
        element = pp.raw.FPDF_StructTree_GetChildAtIndex(tree, position)
        yield Structure(self, element)
The PDF/UA tags.
def
find( self, string: str, case_sensitive: bool = True) -> Iterator[modm_data.pdf.character.Character]:
def find(self, string: str, case_sensitive: bool = True) -> Iterator[list[Character]]:
    """
    Searches for a match string as whole, consecutive words and yields the
    characters of each match.

    :param string: The search string.
    :param case_sensitive: Ignore case if false.
    :return: yields one list of characters per match.
    """
    searcher = self._text.search(string, match_case=case_sensitive,
                                 match_whole_word=True, consecutive=True)
    # Every match is reported as a (first character index, character count)
    # pair; the loop ends when the searcher returns a falsy value.
    while match := searcher.get_next():
        start, count = match
        yield [self.char(ii) for ii in range(start, start + count)]
Searches for a match string as whole, consecutive words and yields the characters.
Parameters
- string: The search string.
- case_sensitive: Ignore case if false.
Returns
Yields, for each match, the list of characters that make up the match.
paths: list[modm_data.pdf.graphics.Path]
@cached_property
def paths(self) -> list[Path]:
    """All path objects of the page, each wrapped in a `Path`."""
    raw_objects = self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])
    return [Path(raw) for raw in raw_objects]
All paths.
images: list[modm_data.pdf.graphics.Image]
@cached_property
def images(self) -> list[Image]:
    """All image objects of the page, each wrapped in an `Image`."""
    raw_objects = self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])
    return [Image(raw) for raw in raw_objects]
All images.
def
graphic_clusters( self, predicate: Callable[[modm_data.pdf.graphics.Path | modm_data.pdf.graphics.Image], bool] = None, absolute_tolerance: float = None) -> list[tuple[modm_data.utils.math.Rectangle, list[modm_data.pdf.graphics.Path]]]:
def graphic_clusters(self, predicate: Callable[[Path|Image], bool] | None = None,
                     absolute_tolerance: float | None = None) -> \
        list[tuple[Rectangle, list[Path|Image]]]:
    """
    Groups the paths and images of the page into clusters: first into
    vertical regions, then into horizontal subregions inside each of them.

    :param predicate: Only graphics for which this returns true are
                      clustered. All graphics are used if `None`.
    :param absolute_tolerance: Tolerance passed to the region overlap and
                               containment checks. Defaults to 1% of the
                               shorter page side.
    :return: `(bounding box, graphics)` tuples sorted by descending
             `bbox.y`, then ascending `bbox.x`.
    """
    if absolute_tolerance is None:
        absolute_tolerance = min(self.width, self.height) * 0.01

    # Paths first, then images, each filtered by the predicate
    graphics = [obj for obj in (*self.paths, *self.images)
                if predicate is None or predicate(obj)]

    # First collect all vertical regions
    regions = []
    for obj in sorted(graphics, key=lambda o: o.bbox.y):
        for reg in regions:
            if reg.overlaps(obj.bbox.bottom, obj.bbox.top, absolute_tolerance):
                # They overlap, so merge them
                reg.v0 = min(reg.v0, obj.bbox.bottom)
                reg.v1 = max(reg.v1, obj.bbox.top)
                reg.objs.append(obj)
                break
        else:
            regions.append(Region(obj.bbox.bottom, obj.bbox.top, obj))

    # Now collect horizontal regions inside each vertical region.
    # The x-ordering does not depend on the region, so sort only once.
    graphics_by_x = sorted(graphics, key=lambda o: o.bbox.x)
    for yreg in regions:
        for obj in graphics_by_x:
            # check if the graphic is contained in the vertical region
            if yreg.contains(obj.bbox.y, absolute_tolerance):
                for xreg in yreg.subregions:
                    if xreg.overlaps(obj.bbox.left, obj.bbox.right, absolute_tolerance):
                        # They overlap so merge them
                        xreg.v0 = min(xreg.v0, obj.bbox.left)
                        xreg.v1 = max(xreg.v1, obj.bbox.right)
                        xreg.objs.append(obj)
                        break
                else:
                    yreg.subregions.append(Region(obj.bbox.left, obj.bbox.right, obj))

    clusters = []
    for yreg in regions:
        for xreg in yreg.subregions:
            if len(yreg.subregions) > 1:
                # Strip down the height again for subregions. Use the real
                # extrema instead of fixed sentinels so that non-positive
                # coordinates are handled correctly.
                y0 = min((o.bbox.bottom for o in xreg.objs), default=yreg.v0)
                y1 = max((o.bbox.top for o in xreg.objs), default=yreg.v1)
            else:
                y0, y1 = yreg.v0, yreg.v1
            bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
            clusters.append((bbox, xreg.objs))

    return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
Inherited Members
- pypdfium2._helpers.page.PdfPage
- parent
- get_width
- get_height
- get_size
- get_rotation
- set_rotation
- get_mediabox
- set_mediabox
- get_cropbox
- set_cropbox
- get_bleedbox
- set_bleedbox
- get_trimbox
- set_trimbox
- get_artbox
- set_artbox
- get_bbox
- get_textpage
- insert_obj
- remove_obj
- gen_content
- get_objects
- render
- pypdfium2.internal.bases.AutoCloseable
- close