modm_data.pdf
PDF Content Accessors
This module extends the pypdfium2 Python API with low-level accessors for characters and graphics. Note that these modules support read-only access to PDFs, since a lot of caching is used to speed up commonly accessed properties.
This module only contains formatting-independent PDF access, which is then specialized in the vendor-specific modm_data.pdf2html modules.
1# Copyright 2022, Niklas Hauser 2# SPDX-License-Identifier: MPL-2.0 3 4""" 5# PDF Content Accessors 6 7This module extends the pypdfium2 Python API with low-level accessors for 8characters and graphics. Note that these modules support read-only access to 9PDFs, since a lot of caching is used to speed up commonly accessed properties. 10 11This module only contains formatting independent PDF access which is then 12specialized in the vendor-specific `modm_data.pdf2html` modules. 13""" 14 15from .document import Document 16from .page import Page 17from .character import Character 18from .link import ObjLink, WebLink 19from .path import Path 20from .image import Image 21from .render import render_page_pdf 22from .structure import Structure 23 24__all__ = [ 25 "Document", 26 "Page", 27 "Character", 28 "ObjLink", 29 "WebLink", 30 "Path", 31 "Image", 32 "Structure", 33 "render_page_pdf", 34]
class Document(pp.PdfDocument):
    """
    This class is a convenience wrapper with caching around the high-level APIs
    of pypdfium.
    """

    def __init__(self, path: Path, autoclose: bool = False):
        """
        :param path: Path to the PDF to open.
        :param autoclose: Passed through unchanged to the `pypdfium2.PdfDocument`
                          constructor.
        """
        path = Path(path)
        self.name: str = path.stem
        """Stem of the document file name"""
        super().__init__(path, autoclose=autoclose)
        self._path = path
        # Document-wide cache that `Page._fix_bboxes()` reads and fills.
        self._bbox_cache = defaultdict(dict)
        _LOGGER.debug(f"Loading: {path}")

    @cached_property
    def metadata(self) -> dict[str, str]:
        """The PDF metadata dictionary."""
        return self.get_metadata_dict()

    @property
    def destinations(self) -> Iterator[tuple[int, str]]:
        """Yields (page 0-index, named destination) of the whole document."""
        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
            # The first call passes no buffer and its result is used as the
            # length of the name.
            # NOTE(review): confirm against the PDFium contract of
            # FPDF_GetNamedDest (return value vs. in/out `buflen`).
            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
            clength = ctypes.c_long(length)
            cbuffer = ctypes.create_string_buffer(length * 2)
            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
            # The name is decoded as UTF-16-LE and trailing NULs are stripped.
            name = cbuffer.raw[: clength.value * 2].decode("utf-16-le").rstrip("\x00")
            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
            yield (page, name)

    @cached_property
    def toc(self) -> list[pp.PdfOutlineItem]:
        """
        The table of contents as a list of outline items sorted by
        (page index, level, title).

        Items without a page index of their own inherit the page index of the
        preceding item (0 if there is none). Duplicate items are dropped.
        """
        # Sometimes the TOC contains duplicates so we must use a set
        entries = set()
        last_page_index = 0
        for item in self.get_toc():
            # Only a *missing* page index is replaced: page 0 is a valid target
            # and must not be mistaken for "no page" by a truthiness test.
            page_index = item.page_index
            if page_index is None:
                page_index = last_page_index
            last_page_index = page_index
            entries.add(
                _OutlineItem(
                    item.level,
                    item.title,
                    item.is_closed,
                    item.n_kids,
                    page_index,
                    item.view_mode,
                    item.view_pos,
                )
            )
        return sorted(entries, key=lambda o: (o.page_index, o.level, o.title))

    @cached_property
    def identifier_permanent(self) -> str:
        """The permanent file identifier."""
        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)

    @cached_property
    def identifier_changing(self) -> str:
        """The changing file identifier."""
        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)

    @cached_property
    def page_count(self) -> int:
        """The number of pages in the document."""
        return pp.raw.FPDF_GetPageCount(self)

    # NOTE(review): `functools.cache` on a method keys on `self` and therefore
    # keeps every document and its loaded pages alive as long as the cache lives.
    @cache
    def page(self, index: int) -> Page:
        """
        :param index: 0-indexed page number.
        :return: the page object for the index.
        :raises AssertionError: if the index is outside `[0, page_count)`.
        """
        # Negative indices are rejected here too instead of being handed to PDFium.
        assert 0 <= index < self.page_count
        return Page(self, index)

    def pages(self, numbers: Iterable[int] | None = None) -> Iterator[Page]:
        """
        :param numbers: an iterable range of page numbers (0-indexed!).
                        If `None`, then the whole page range is used.
        :return: yields each page in the range. Numbers outside of
                 `[0, page_count)` are silently skipped.
        """
        if numbers is None:
            numbers = range(self.page_count)
        for ii in numbers:
            if 0 <= ii < self.page_count:
                yield self.page(ii)

    def __repr__(self) -> str:
        return f"Doc({self.name})"
This class is a convenience wrapper with caching around the high-level APIs of pypdfium.
def __init__(self, path: Path, autoclose: bool = False):
    """
    Open the PDF file and prepare the per-document caches.

    :param path: Path to the PDF to open.
    :param autoclose: Passed through unchanged to the parent constructor.
    """
    path = Path(path)
    self.name: str = path.stem
    """Stem of the document file name"""
    super().__init__(path, autoclose=autoclose)
    # Starts out empty; keyed lookups create nested dicts on demand.
    self._bbox_cache = defaultdict(dict)
    self._path = path
    _LOGGER.debug(f"Loading: {path}")
Parameters
- path: Path to the PDF to open.
@cached_property
def metadata(self) -> dict[str, str]:
    """The metadata dictionary of the PDF, fetched once via `get_metadata_dict()`."""
    info = self.get_metadata_dict()
    return info
The PDF metadata dictionary.
@property
def destinations(self) -> Iterator[tuple[int, str]]:
    """Yields one (0-indexed page, destination name) pair per named destination."""
    total = pp.raw.FPDF_CountNamedDests(self)
    for dest_index in range(total):
        # First call without a buffer; its result is used as the name length.
        # NOTE(review): confirm against the PDFium contract of
        # FPDF_GetNamedDest (return value vs. in/out `buflen`).
        length = pp.raw.FPDF_GetNamedDest(self, dest_index, 0, 0)
        clength = ctypes.c_long(length)
        cbuffer = ctypes.create_string_buffer(length * 2)
        dest = pp.raw.FPDF_GetNamedDest(self, dest_index, cbuffer, clength)
        # Decode the name as UTF-16-LE and strip the trailing NULs.
        raw_name = cbuffer.raw[: clength.value * 2]
        name = raw_name.decode("utf-16-le").rstrip("\x00")
        yield (pp.raw.FPDFDest_GetDestPageIndex(self, dest), name)
Yields (page 0-index, named destination) of the whole document.
@cached_property
def toc(self) -> list[pp.PdfOutlineItem]:
    """
    The table of contents as a list of outline items sorted by
    (page index, level, title).

    Items without a page index of their own inherit the page index of the
    preceding item (0 if there is none). Duplicate items are dropped.
    """
    # Sometimes the TOC contains duplicates so we must use a set
    entries = set()
    last_page_index = 0
    for item in self.get_toc():
        # Only a *missing* page index is replaced: page 0 is a valid target
        # and must not be mistaken for "no page" by a truthiness test.
        page_index = item.page_index
        if page_index is None:
            page_index = last_page_index
        last_page_index = page_index
        entries.add(
            _OutlineItem(
                item.level,
                item.title,
                item.is_closed,
                item.n_kids,
                page_index,
                item.view_mode,
                item.view_pos,
            )
        )
    return sorted(entries, key=lambda o: (o.page_index, o.level, o.title))
The table of contents as a sorted list of outline items, ensuring each item has a page index by reusing the last one.
@cached_property
def identifier_permanent(self) -> str:
    """The file identifier of type `FILEIDTYPE_PERMANENT`."""
    id_type = pp.raw.FILEIDTYPE_PERMANENT
    return self.get_identifier(id_type)
The permanent file identifier.
@cached_property
def identifier_changing(self) -> str:
    """The file identifier of type `FILEIDTYPE_CHANGING`."""
    id_type = pp.raw.FILEIDTYPE_CHANGING
    return self.get_identifier(id_type)
The changing file identifier.
@cached_property
def page_count(self) -> int:
    """Number of pages, queried once from `FPDF_GetPageCount`."""
    count = pp.raw.FPDF_GetPageCount(self)
    return count
The number of pages in the document.
# NOTE(review): `functools.cache` on a method keys on `self` and therefore
# keeps every document and its loaded pages alive as long as the cache lives.
@cache
def page(self, index: int) -> Page:
    """
    :param index: 0-indexed page number.
    :return: the page object for the index.
    :raises AssertionError: if the index is outside `[0, page_count)`.
    """
    # Negative indices are rejected here too instead of being handed to PDFium.
    assert 0 <= index < self.page_count
    return Page(self, index)
Parameters
- index: 0-indexed page number.
Returns
the page object for the index.
def pages(self, numbers: Iterable[int] | None = None) -> Iterator[Page]:
    """
    :param numbers: an iterable range of page numbers (0-indexed!).
                    If `None`, then the whole page range is used.
    :return: yields each page in the range. Numbers outside of
             `[0, page_count)` are silently skipped.
    """
    if numbers is None:
        numbers = range(self.page_count)
    for number in numbers:
        if 0 <= number < self.page_count:
            yield self.page(number)
Parameters
- numbers: an iterable range of page numbers (0-indexed!).
If None, then the whole page range is used.
Returns
yields each page in the range.
Inherited Members
- pypdfium2._helpers.document.PdfDocument
- formenv
- parent
- new
- init_forms
- get_formtype
- get_pagemode
- is_tagged
- save
- get_identifier
- get_version
- get_metadata_value
- METADATA_KEYS
- get_metadata_dict
- count_attachments
- get_attachment
- new_attachment
- del_attachment
- get_page
- new_page
- del_page
- import_pages
- get_page_size
- get_page_label
- page_as_xobject
- get_toc
- render
- pypdfium2.internal.bases.AutoCloseable
- close
class Page(pp.PdfPage):
    """
    This class provides low-level access to graphics and characters of the page.
    It also fixes missing bounding boxes for rotated characters on page load,
    and allows searching for characters in an area instead of just text.
    """

    def __init__(self, document: "modm_data.pdf.Document", index: int):  # noqa: F821
        """
        :param document: a PDF document.
        :param index: 0-index page number.
        """
        self.index = index
        """0-index page number."""
        self.number = index + 1
        """1-index page number."""

        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
        self._links = None
        self._weblinks = None
        self._linked = False

        _LOGGER.debug(f"Loading: {index}")

        self._text = self.get_textpage()
        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
        # close them in reverse order
        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)

        self._fix_bboxes()

    @cached_property
    def label(self) -> str:
        """The page label."""
        return self.pdf.get_page_label(self.index)

    @cached_property
    def width(self) -> float:
        """The page width."""
        return self.get_width()

    @cached_property
    def height(self) -> float:
        """The page height."""
        return self.get_height()

    @cached_property
    def rotation(self) -> int:
        """The page rotation in degrees."""
        return self.get_rotation()

    @cached_property
    def bbox(self) -> Rectangle:
        """The page bounding box."""
        return Rectangle(*self.get_bbox())

    @cached_property
    def char_count(self) -> int:
        """The total count of characters."""
        return self._text.count_chars()

    @cache
    def char(self, index: int) -> Character:
        """:return: The character at the 0-index."""
        return Character(self, index)

    @property
    def chars(self) -> Iterator[Character]:
        """Yields all characters."""
        for ii in range(self.char_count):
            yield self.char(ii)

    @cached_property
    def objlinks(self) -> list[ObjLink]:
        """All object links."""
        links = []
        pos = ctypes.c_int(0)
        link = pp.raw.FPDF_LINK()
        # NOTE(review): the same `link` handle object is passed to every
        # `ObjLink`; presumably `ObjLink` reads what it needs right away.
        while pp.raw.FPDFLink_Enumerate(self, pos, link):
            links.append(ObjLink(self, link))
        return links

    @cached_property
    def weblinks(self) -> list[WebLink]:
        """All web links."""
        return [WebLink(self, ii) for ii in range(pp.raw.FPDFLink_CountWebLinks(self._linkpage))]

    def chars_in_area(self, area: Rectangle) -> list[Character]:
        """
        :param area: area to search for character in.
        :return: All characters whose bounding box midpoint lies in the area.
        """
        found = []
        # We perform binary searches of the lower and upper y-positions first
        # lines are ordered by y-position
        ypositions = list(self._charlines.keys())
        y_bottom = bisect_left(ypositions, area.bottom)
        y_top = bisect_right(ypositions, area.top, lo=y_bottom)

        # Then for every line we do another binary search for left and right
        for ypos in ypositions[y_bottom:y_top]:
            chars = self._charlines[ypos]
            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
            # Finally we add all these characters
            found.extend(chars[x_left:x_right])
        return found

    def text_in_area(self, area: Rectangle) -> str:
        """
        :param area: area to search for text in.
        :return: Only the text found in the area.
        """
        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)

    @property
    def structures(self) -> Iterator[Structure]:
        """The PDF/UA tags."""
        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
        for ii in range(count):
            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
            yield Structure(self, child)

    def find(self, string: str, case_sensitive: bool = True) -> Iterator[list[Character]]:
        """
        Searches for a match string as whole, consecutive words and yields the
        characters of each match.

        :param string: The search string.
        :param case_sensitive: Ignore case if false.
        :return: yields one list of characters per match.
        """
        searcher = self._text.search(string, match_case=case_sensitive, match_whole_word=True, consecutive=True)
        while hit := searcher.get_next():
            first, count = hit
            yield [self.char(ii) for ii in range(first, first + count)]

    @cached_property
    def paths(self) -> list[Path]:
        """All paths."""
        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]

    @cached_property
    def images(self) -> list[Image]:
        """All images."""
        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]

    def graphic_clusters(
        self,
        predicate: Callable[[Path | Image], bool] | None = None,
        absolute_tolerance: float | None = None,
    ) -> list[tuple[Rectangle, list[Path | Image]]]:
        """
        Groups the paths and images of the page into clusters.

        Objects are first merged into vertical regions using their bottom/top
        extents, then inside every vertical region into horizontal subregions
        using their left/right extents. Every subregion becomes one cluster.

        :param predicate: Only objects for which this returns true are
                          clustered. If `None`, all paths and images are used.
        :param absolute_tolerance: Tolerance handed to the `Region` overlap and
                                   containment checks. If `None`, 1% of the
                                   smaller page dimension is used.
        :return: (bounding box, objects) per cluster, sorted by descending
                 `bbox.y`, then ascending `bbox.x`.
        """
        if absolute_tolerance is None:
            absolute_tolerance = min(self.width, self.height) * 0.01

        # Paths come first, then images, each filtered by the predicate
        filtered = [obj for obj in (*self.paths, *self.images) if predicate is None or predicate(obj)]

        # First collect all vertical regions
        regions = []
        for obj in sorted(filtered, key=lambda o: o.bbox.y):
            for reg in regions:
                if reg.overlaps(obj.bbox.bottom, obj.bbox.top, absolute_tolerance):
                    # They overlap, so merge them
                    reg.v0 = min(reg.v0, obj.bbox.bottom)
                    reg.v1 = max(reg.v1, obj.bbox.top)
                    reg.objs.append(obj)
                    break
            else:
                regions.append(Region(obj.bbox.bottom, obj.bbox.top, obj))

        # Now collect horizontal region inside each vertical region.
        # The x-ordering does not depend on the region, so sort only once.
        by_x = sorted(filtered, key=lambda o: o.bbox.x)
        for yreg in regions:
            for obj in by_x:
                # check if the object is contained in the vertical region
                if not yreg.contains(obj.bbox.y, absolute_tolerance):
                    continue
                for xreg in yreg.subregions:
                    if xreg.overlaps(obj.bbox.left, obj.bbox.right, absolute_tolerance):
                        # They overlap so merge them
                        xreg.v0 = min(xreg.v0, obj.bbox.left)
                        xreg.v1 = max(xreg.v1, obj.bbox.right)
                        xreg.objs.append(obj)
                        break
                else:
                    yreg.subregions.append(Region(obj.bbox.left, obj.bbox.right, obj))

        clusters = []
        for yreg in regions:
            for xreg in yreg.subregions:
                if len(yreg.subregions) > 1:
                    # Strip down the height again for subregions. The extent is
                    # taken from the objects themselves, not from fixed sentinel
                    # values, so negative coordinates are handled as well.
                    y0 = min(obj.bbox.bottom for obj in xreg.objs)
                    y1 = max(obj.bbox.top for obj in xreg.objs)
                else:
                    y0, y1 = yreg.v0, yreg.v1
                clusters.append((Rectangle(xreg.v0, y0, xreg.v1, y1), xreg.objs))

        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))

    def _link_characters(self):
        """Attaches the object and web links of the page to their characters (once)."""
        if self._linked:
            return
        # The in-document links only gives us rectangles and we must find the
        # linked chars ourselves
        for link in self.objlinks:
            for char in self.chars_in_area(link.bbox):
                char.objlink = link
        # The weblinks give you an explicit char range, very convenient
        for link in self.weblinks:
            for ii in range(*link.range):
                self.char(ii).weblink = link
        self._linked = True

    @cached_property
    def _charlines(self):
        """
        Characters grouped by the y-midpoint of their bounding box (rounded to
        one decimal), ordered by ascending y; each line is sorted by x-midpoint.
        """
        charlines = defaultdict(list)
        for char in self.chars:
            charlines[round(char.bbox.midpoint.y, 1)].append(char)

        return OrderedDict(
            (ypos, sorted(charlines[ypos], key=lambda c: c.bbox.midpoint.x)) for ypos in sorted(charlines)
        )

    def _fix_bboxes(self):
        """
        Replaces empty loose bounding boxes with a cached box of an equivalent
        character (same font, unicode and rounded tight box size).
        """

        def _key(char):
            height = round(char.tbbox.height, 1)
            width = round(char.tbbox.width, 1)
            return f"{char.font} {char.unicode} {height} {width}"

        fix_chars = []
        for char in self.chars:
            if not char._bbox.width or not char._bbox.height:
                # Empty box: needs fixing unless it is an unrotated line break
                if char._rotation or char.unicode not in {0xA, 0xD}:
                    fix_chars.append(char)
            elif char.unicode not in {0xA, 0xD} and not char._rotation and _key(char) not in self.pdf._bbox_cache:
                # Valid box of an unrotated character: remember it, normalized
                # to the character origin
                bbox = char._bbox.translated(-char.origin).rotated(self.rotation + char._rotation)
                self.pdf._bbox_cache[_key(char)] = (char, bbox)
        for char in fix_chars:
            cached = self.pdf._bbox_cache.get(_key(char))
            if cached is not None:
                _, bbox = cached
                # Undo the normalization for the position of this character
                char._bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
            elif char.unicode not in {0x20, 0xA, 0xD}:
                _LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")
This class provides low-level access to graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.
def __init__(self, document: "modm_data.pdf.Document", index: int):  # noqa: F821
    """
    Load the page from the document and prepare its text, link and
    structure handles.

    :param document: a PDF document.
    :param index: 0-index page number.
    """
    self.index = index
    """0-index page number."""
    self.number = index + 1
    """1-index page number."""

    raw_page = pp.raw.FPDF_LoadPage(document, index)
    super().__init__(raw_page, document, document.formenv)
    self._links = None
    self._weblinks = None
    self._linked = False

    _LOGGER.debug(f"Loading: {index}")

    self._text = self.get_textpage()
    self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
    self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
    # close them in reverse order
    for closer, handle in (
        (pp.raw.FPDF_StructTree_Close, self._structtree),
        (pp.raw.FPDFLink_CloseWebLinks, self._linkpage),
    ):
        weakref.finalize(self, closer, handle)

    self._fix_bboxes()
Parameters
- document: a PDF document.
- index: 0-index page number.
@cached_property
def label(self) -> str:
    """The label of this page, looked up in the owning document by index."""
    document = self.pdf
    return document.get_page_label(self.index)
The page label.
@cached_property
def width(self) -> float:
    """Width of the page as returned by `get_width()`."""
    page_width = self.get_width()
    return page_width
The page width.
@cached_property
def height(self) -> float:
    """Height of the page as returned by `get_height()`."""
    page_height = self.get_height()
    return page_height
The page height.
@cached_property
def rotation(self) -> int:
    """Rotation of the page in degrees as returned by `get_rotation()`."""
    degrees = self.get_rotation()
    return degrees
The page rotation in degrees.
@cached_property
def bbox(self) -> Rectangle:
    """Bounding box of the page, built from the values of `get_bbox()`."""
    coordinates = self.get_bbox()
    return Rectangle(*coordinates)
The page bounding box.
@cached_property
def char_count(self) -> int:
    """Number of characters on the text page."""
    textpage = self._text
    return textpage.count_chars()
The total count of characters.
@cache
def char(self, index: int) -> Character:
    """
    :param index: 0-index of the character on this page.
    :return: The character at the 0-index.
    """
    character = Character(self, index)
    return character
Returns
The character at the 0-index.
@property
def chars(self) -> Iterator[Character]:
    """Yields every character of the page in index order."""
    yield from map(self.char, range(self.char_count))
Yields all characters.
@cached_property
def objlinks(self) -> list[ObjLink]:
    """All object links found by enumerating the page with `FPDFLink_Enumerate`."""
    result = []
    cursor = ctypes.c_int(0)
    handle = pp.raw.FPDF_LINK()
    # NOTE(review): the same `handle` object is passed to every `ObjLink`;
    # presumably `ObjLink` reads what it needs right away.
    while pp.raw.FPDFLink_Enumerate(self, cursor, handle):
        result.append(ObjLink(self, handle))
    return result
All object links.
@cached_property
def weblinks(self) -> list[WebLink]:
    """All web links, one per index reported by `FPDFLink_CountWebLinks`."""
    total = pp.raw.FPDFLink_CountWebLinks(self._linkpage)
    return [WebLink(self, link_index) for link_index in range(total)]
All web links.
122 def chars_in_area(self, area: Rectangle) -> list[Character]: 123 """ 124 :param area: area to search for character in. 125 :return: All characters found in the area. 126 """ 127 found = [] 128 # We perform binary searches of the lower and upper y-positions first 129 # lines are ordered by y-position 130 ypositions = list(self._charlines.keys()) 131 y_bottom = bisect_left(ypositions, area.bottom) 132 y_top = bisect_right(ypositions, area.top, lo=y_bottom) 133 134 # Then for every line we do another binary search for left and right 135 for ypos in ypositions[y_bottom:y_top]: 136 chars = self._charlines[ypos] 137 x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x) 138 x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x) 139 # Finally we add all these characters 140 found.extend(chars[x_left:x_right]) 141 return found
Parameters
- area: area to search for character in.
Returns
All characters found in the area.
def text_in_area(self, area: Rectangle) -> str:
    """
    Extract the text inside an area from the text page.

    :param area: area to search for text in.
    :return: Only the text found in the area.
    """
    bounds = (area.left, area.bottom, area.right, area.top)
    return self._text.get_text_bounded(*bounds)
Parameters
- area: area to search for text in.
Returns
Only the text found in the area.
@property
def structures(self) -> Iterator[Structure]:
    """Yields one `Structure` per child of the page's structure tree (the PDF/UA tags)."""
    tree = self._structtree
    for child_index in range(pp.raw.FPDF_StructTree_CountChildren(tree)):
        yield Structure(self, pp.raw.FPDF_StructTree_GetChildAtIndex(tree, child_index))
The PDF/UA tags.
def find(self, string: str, case_sensitive: bool = True) -> Iterator[list[Character]]:
    """
    Searches for a match string as whole, consecutive words and yields the
    characters of each match.

    :param string: The search string.
    :param case_sensitive: Ignore case if false.
    :return: yields one list of characters per match.
    """
    searcher = self._text.search(string, match_case=case_sensitive, match_whole_word=True, consecutive=True)
    # Every hit is a (first character index, character count) pair
    while hit := searcher.get_next():
        first, count = hit
        yield [self.char(ii) for ii in range(first, first + count)]
Searches for a match string as whole, consecutive words and yields the characters.
Parameters
- string: The search string.
- case_sensitive: Ignore case if false.
Returns
yields the characters found.
@cached_property
def paths(self) -> list[Path]:
    """All page objects of type `FPDF_PAGEOBJ_PATH`, each wrapped in a `Path`."""
    wrapped = []
    for obj in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH]):
        wrapped.append(Path(obj))
    return wrapped
All paths.
@cached_property
def images(self) -> list[Image]:
    """All page objects of type `FPDF_PAGEOBJ_IMAGE`, each wrapped in an `Image`."""
    wrapped = []
    for obj in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE]):
        wrapped.append(Image(obj))
    return wrapped
All images.
def graphic_clusters(
    self,
    predicate: Callable[[Path | Image], bool] | None = None,
    absolute_tolerance: float | None = None,
) -> list[tuple[Rectangle, list[Path | Image]]]:
    """
    Groups the paths and images of the page into clusters.

    Objects are first merged into vertical regions using their bottom/top
    extents, then inside every vertical region into horizontal subregions
    using their left/right extents. Every subregion becomes one cluster.

    :param predicate: Only objects for which this returns true are
                      clustered. If `None`, all paths and images are used.
    :param absolute_tolerance: Tolerance handed to the `Region` overlap and
                               containment checks. If `None`, 1% of the
                               smaller page dimension is used.
    :return: (bounding box, objects) per cluster, sorted by descending
             `bbox.y`, then ascending `bbox.x`.
    """
    if absolute_tolerance is None:
        absolute_tolerance = min(self.width, self.height) * 0.01

    # Paths come first, then images, each filtered by the predicate
    filtered = [obj for obj in (*self.paths, *self.images) if predicate is None or predicate(obj)]

    # First collect all vertical regions
    regions = []
    for obj in sorted(filtered, key=lambda o: o.bbox.y):
        for reg in regions:
            if reg.overlaps(obj.bbox.bottom, obj.bbox.top, absolute_tolerance):
                # They overlap, so merge them
                reg.v0 = min(reg.v0, obj.bbox.bottom)
                reg.v1 = max(reg.v1, obj.bbox.top)
                reg.objs.append(obj)
                break
        else:
            regions.append(Region(obj.bbox.bottom, obj.bbox.top, obj))

    # Now collect horizontal region inside each vertical region.
    # The x-ordering does not depend on the region, so sort only once.
    by_x = sorted(filtered, key=lambda o: o.bbox.x)
    for yreg in regions:
        for obj in by_x:
            # check if the object is contained in the vertical region
            if not yreg.contains(obj.bbox.y, absolute_tolerance):
                continue
            for xreg in yreg.subregions:
                if xreg.overlaps(obj.bbox.left, obj.bbox.right, absolute_tolerance):
                    # They overlap so merge them
                    xreg.v0 = min(xreg.v0, obj.bbox.left)
                    xreg.v1 = max(xreg.v1, obj.bbox.right)
                    xreg.objs.append(obj)
                    break
            else:
                yreg.subregions.append(Region(obj.bbox.left, obj.bbox.right, obj))

    clusters = []
    for yreg in regions:
        for xreg in yreg.subregions:
            if len(yreg.subregions) > 1:
                # Strip down the height again for subregions. The extent is
                # taken from the objects themselves, not from fixed sentinel
                # values, so negative coordinates are handled as well.
                y0 = min(obj.bbox.bottom for obj in xreg.objs)
                y1 = max(obj.bbox.top for obj in xreg.objs)
            else:
                y0, y1 = yreg.v0, yreg.v1
            clusters.append((Rectangle(xreg.v0, y0, xreg.v1, y1), xreg.objs))

    return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
Inherited Members
- pypdfium2._helpers.page.PdfPage
- parent
- get_width
- get_height
- get_size
- get_rotation
- set_rotation
- get_mediabox
- set_mediabox
- get_cropbox
- set_cropbox
- get_bleedbox
- set_bleedbox
- get_trimbox
- set_trimbox
- get_artbox
- set_artbox
- get_bbox
- get_textpage
- insert_obj
- remove_obj
- gen_content
- get_objects
- render
- pypdfium2.internal.bases.AutoCloseable
- close
class Character:
    """
    This class contains all information about a single character in the PDF
    page.
    """

    class RenderMode(Enum):
        """Tells the PDF viewer how to render this character glyph."""

        UNKNOWN = -1
        FILL = 0
        STROKE = 1
        FILL_STROKE = 2
        INVISIBLE = 3
        FILL_CLIP = 4
        STROKE_CLIP = 5
        FILL_STROKE_CLIP = 6
        CLIP = 7

    def __init__(self, page: "modm_data.pdf.page.Page", index: int):  # noqa: F821
        """
        :param page: The page containing the character.
        :param index: The index of the character.
        """
        self._page = page
        self._text = page._text
        self._index = index
        self._font = None
        # Character angle converted from radians and truncated to whole degrees
        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))

        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
        """The unicode value of the character."""
        self.objlink: "modm_data.pdf.link.ObjLink" = None  # noqa: F821
        """The object link of this character or `None`"""
        self.weblink: "modm_data.pdf.link.WebLink" = None  # noqa: F821
        """The web link of this character or `None`"""

        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
        if self._page.rotation:
            # Same coordinate swap as in `tbbox` and `origin` for rotated pages
            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x, bbox.p1.y, self._page.height - bbox.p0.x)
        self._bbox = bbox

    def _font_flags(self) -> tuple[str, int]:
        """Returns the (font name, font flags) pair, queried once and then reused."""
        if self._font is None:
            font = ctypes.create_string_buffer(255)
            flags = ctypes.c_int()
            pp.raw.FPDFText_GetFontInfo(self._text, self._index, font, 255, flags)
            self._font = (font.value.decode("utf-8"), flags.value)
        return self._font

    @property
    def char(self) -> str:
        """The printable string of the unicode value, or an empty string."""
        char = chr(self.unicode)
        return char if char.isprintable() else ""

    @cached_property
    def origin(self) -> Point:
        """
        The origin of the character.

        :raises AssertionError: if PDFium cannot provide the origin.
        """
        x, y = ctypes.c_double(), ctypes.c_double()
        # The lookup must not be placed inside an `assert` statement: with
        # `python -O` asserts are stripped and the call would never happen,
        # silently leaving x and y at 0.0.
        if not pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y):
            raise AssertionError(f"Unable to get the origin of character {self._index}")
        if self._page.rotation:
            return Point(y.value, self._page.height - x.value)
        return Point(x.value, y.value)

    @cached_property
    def width(self) -> float:
        """
        The width of the character's bounding box.
        For any non-zero rotation the height of the bounding box is returned.
        """
        if self.rotation:
            return self.bbox.height
        return self.bbox.width

    @cached_property
    def height(self) -> float:
        """
        The height of the character's bounding box.
        For any non-zero rotation the width of the bounding box is returned.
        """
        if self.rotation:
            return self.bbox.width
        return self.bbox.height

    @cached_property
    def tbbox(self) -> Rectangle:
        """The tight bounding box of the character."""
        tbbox = Rectangle(*self._text.get_charbox(self._index))
        if self._page.rotation:
            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x, tbbox.p1.y, self._page.height - tbbox.p0.x)
        return tbbox

    @property
    def bbox(self) -> Rectangle:
        """
        The loose bounding box of the character.
        .. note::
            If the loose bounding box is not available, the tight bounding box
            is used instead.
        """
        if not self._bbox.width or not self._bbox.height:
            return self.tbbox
        return self._bbox

    @cached_property
    def twidth(self) -> float:
        """The width of the character's tight bounding box."""
        return self.tbbox.width

    @cached_property
    def theight(self) -> float:
        """The height of the character's tight bounding box."""
        return self.tbbox.height

    @cached_property
    def render_mode(self) -> RenderMode:
        """The render mode of the character."""
        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))

    @cached_property
    def rotation(self) -> int:
        """The rotation of the character in degrees modulo 360."""
        # Special case for vertical text in rotated pages
        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xA, 0xD}:
            return 90
        if self._page.rotation and self._rotation:
            return (self._page.rotation + self._rotation) % 360
        return self._rotation

    @cached_property
    def size(self) -> float:
        """The font size of the character."""
        return pp.raw.FPDFText_GetFontSize(self._text, self._index)

    @cached_property
    def weight(self) -> int:
        """The font weight of the character."""
        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)

    @cached_property
    def fill(self) -> int:
        """The fill color of the character, packed as 0xRRGGBBAA."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def stroke(self) -> int:
        """The stroke color of the character, packed as 0xRRGGBBAA."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def font(self) -> str:
        """The font name of the character."""
        return self._font_flags()[0]

    @cached_property
    def flags(self) -> int:
        """The font flags of the character."""
        return self._font_flags()[1]

    def descr(self) -> str:
        """Human-readable description of the character for debugging."""
        char = chr(self.unicode)
        if not char.isprintable():
            char = hex(self.unicode)
        return (
            f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, "
            f"{self.render_mode}, {self.font}, {hex(self.flags)}, "
            f"{self.fill}, {self.stroke}, {repr(self.bbox)})"
        )

    def __str__(self) -> str:
        return self.char

    def __repr__(self) -> str:
        char = chr(self.unicode)
        # Whitespace gets a visible stand-in, other unprintables their hex value
        escape = {0xA: "\\n", 0xD: "\\r", 0x9: "\\t", 0x20: "␣"}
        return escape.get(self.unicode, char if char.isprintable() else hex(self.unicode))
This class contains all information about a single character in the PDF page.
def __init__(self, page: "modm_data.pdf.page.Page", index: int):  # noqa: F821
    """
    Read the eagerly needed properties of one character from the text page.

    :param page: The page containing the character.
    :param index: The index of the character.
    """
    self._page = page
    self._text = page._text
    self._index = index
    self._font = None
    # Character angle converted from radians and truncated to whole degrees
    angle = pp.raw.FPDFText_GetCharAngle(self._text, self._index)
    self._rotation = int(math.degrees(angle))

    self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
    """The unicode value of the character."""
    self.objlink: "modm_data.pdf.link.ObjLink" = None  # noqa: F821
    """The object link of this character or `None`"""
    self.weblink: "modm_data.pdf.link.WebLink" = None  # noqa: F821
    """The web link of this character or `None`"""

    loose = Rectangle(*self._text.get_charbox(self._index, loose=True))
    if self._page.rotation:
        # Rotated page: swap the axes and mirror along the page height
        loose = Rectangle(
            loose.p0.y,
            self._page.height - loose.p1.x,
            loose.p1.y,
            self._page.height - loose.p0.x,
        )
    self._bbox = loose
Parameters
- page: The page containing the character.
- index: The index of the character.
@property
def char(self) -> str:
    """The character as a string, or an empty string if it is not printable."""
    glyph = chr(self.unicode)
    if glyph.isprintable():
        return glyph
    return ""
The printable string of the unicode value.
@cached_property
def origin(self) -> Point:
    """
    The origin of the character.

    On pages that report a rotation the coordinates are remapped to
    ``(y, page height - x)``.
    """
    cx = ctypes.c_double()
    cy = ctypes.c_double()
    assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, cx, cy)
    if not self._page.rotation:
        return Point(cx.value, cy.value)
    return Point(cy.value, self._page.height - cx.value)
The origin of the character.
@cached_property
def width(self) -> float:
    """
    The width of the character's bounding box.

    For a rotated character the box height is reported instead.
    """
    return self.bbox.height if self.rotation else self.bbox.width
The width of the character's bounding box.
@cached_property
def height(self) -> float:
    """
    The height of the character's bounding box.

    For a rotated character the box width is reported instead.
    """
    return self.bbox.width if self.rotation else self.bbox.height
The height of the character's bounding box.
@cached_property
def tbbox(self) -> Rectangle:
    """
    The tight bounding box of the character.

    The box is remapped the same way as the loose box when the page reports
    a rotation.
    """
    tight = Rectangle(*self._text.get_charbox(self._index))
    if not self._page.rotation:
        return tight
    page = self._page
    return Rectangle(tight.p0.y, page.height - tight.p1.x, tight.p1.y, page.height - tight.p0.x)
The tight bounding box of the character.
@property
def bbox(self) -> Rectangle:
    """
    The loose bounding box of the character.

    .. note::
        If the loose bounding box has no width or no height, the tight
        bounding box is used instead.
    """
    loose = self._bbox
    if loose.width and loose.height:
        return loose
    return self.tbbox
The loose bounding box of the character.
If the loose bounding box is not available, the tight bounding box is used instead.
@cached_property
def twidth(self) -> float:
    """The width of the character's tight bounding box."""
    tight = self.tbbox
    return tight.width
The width of the character's tight bounding box.
@cached_property
def theight(self) -> float:
    """The height of the character's tight bounding box."""
    tight = self.tbbox
    return tight.height
The height of the character's tight bounding box.
@cached_property
def render_mode(self) -> RenderMode:
    """The render mode of the character as a `RenderMode` member."""
    raw_mode = pp.raw.FPDFText_GetTextRenderMode(self._text, self._index)
    return Character.RenderMode(raw_mode)
The render mode of the character.
@cached_property
def rotation(self) -> int:
    """
    The rotation of the character in degrees modulo 360.

    The page rotation is only added when both the page and the character
    report a non-zero rotation.
    """
    page_rotation = self._page.rotation
    char_rotation = self._rotation
    if char_rotation == 0:
        # Special case for vertical text in rotated pages: unrotated glyphs
        # other than space, line feed and carriage return count as 90 degrees.
        if page_rotation == 90 and self.unicode not in {0x20, 0xA, 0xD}:
            return 90
        return char_rotation
    if page_rotation:
        return (page_rotation + char_rotation) % 360
    return char_rotation
The rotation of the character in degrees modulo 360.
@cached_property
def size(self) -> float:
    """The font size of the character as reported by pdfium."""
    font_size = pp.raw.FPDFText_GetFontSize(self._text, self._index)
    return font_size
The font size of the character.
@cached_property
def weight(self) -> int:
    """The font weight of the character as reported by pdfium."""
    font_weight = pp.raw.FPDFText_GetFontWeight(self._text, self._index)
    return font_weight
The font weight of the character.
@cached_property
def fill(self) -> int:
    """
    The fill color of the character.

    The four channels are packed as ``r << 24 | g << 16 | b << 8 | a``.
    """
    channels = [ctypes.c_uint() for _ in range(4)]
    pp.raw.FPDFText_GetFillColor(self._text, self._index, *channels)
    red, green, blue, alpha = (channel.value for channel in channels)
    return (red << 24) | (green << 16) | (blue << 8) | alpha
The fill color of the character.
@cached_property
def stroke(self) -> int:
    """
    The stroke color of the character.

    The four channels are packed as ``r << 24 | g << 16 | b << 8 | a``.
    """
    channels = [ctypes.c_uint() for _ in range(4)]
    pp.raw.FPDFText_GetStrokeColor(self._text, self._index, *channels)
    red, green, blue, alpha = (channel.value for channel in channels)
    return (red << 24) | (green << 16) | (blue << 8) | alpha
The stroke color of the character.
@cached_property
def font(self) -> str:
    """The font name of the character (first item of ``_font_flags()``)."""
    font_info = self._font_flags()
    return font_info[0]
The font name of the character.
@cached_property
def flags(self) -> int:
    """The font flags of the character (second item of ``_font_flags()``)."""
    font_info = self._font_flags()
    return font_info[1]
The font flags of the character.
def descr(self) -> str:
    """
    Human-readable description of the character for debugging.

    Non-printable characters are shown as their hexadecimal code point.
    """
    glyph = chr(self.unicode)
    label = glyph if glyph.isprintable() else hex(self.unicode)
    parts = (
        label,
        self.size,
        self.weight,
        self.rotation,
        self.render_mode,
        self.font,
        hex(self.flags),
        self.fill,
        self.stroke,
        repr(self.bbox),
    )
    return "Chr(" + ", ".join(f"{part}" for part in parts) + ")"
Human-readable description of the character for debugging.
class RenderMode(Enum):
    """Tells the PDF viewer how to render this character glyph."""

    # The member names describe which painting operations apply to the glyph.
    # NOTE(review): presumably the values mirror pdfium's text render mode
    # constants, with -1 standing for an unknown mode -- confirm against pdfium.
    UNKNOWN = -1
    FILL = 0
    STROKE = 1
    FILL_STROKE = 2
    INVISIBLE = 3
    FILL_CLIP = 4
    STROKE_CLIP = 5
    FILL_STROKE_CLIP = 6
    CLIP = 7
Tells the PDF viewer how to render this character glyph.
Inherited Members
- enum.Enum
- name
- value
class ObjLink:
    """A link to a PDF object giving the bounding box and destination page."""

    def __init__(self, page: "modm_data.pdf.Page", link: pp.raw.FPDF_LINK):  # noqa: F821
        """
        Resolve the link destination and read the annotation rectangle.

        :param page: Page containing the link, used to compute bounding box.
        :param link: Raw link object.
        """
        self._page = page
        self._dest = pp.raw.FPDFLink_GetDest(page.pdf, link)

        rect = pp.raw.FS_RECTF()
        assert pp.raw.FPDFLink_GetAnnotRect(link, rect)
        source_box = Rectangle(rect)
        if page.rotation:
            # Swap the axes and mirror along the page height for rotated pages.
            source_box = Rectangle(
                source_box.p0.y,
                page.height - source_box.p1.x,
                source_box.p1.y,
                page.height - source_box.p0.x,
            )
        self.bbox: Rectangle = source_box
        """Bounding box of the link source"""

    @cached_property
    def page_index(self) -> int:
        """0-indexed page number of the link destination."""
        document = self._page.pdf
        return pp.raw.FPDFDest_GetDestPageIndex(document, self._dest)

    def __repr__(self) -> str:
        return f"Obj({self.page_index})"
A link to a PDF object giving the bounding box and destination page.
def __init__(self, page: "modm_data.pdf.Page", link: pp.raw.FPDF_LINK):  # noqa: F821
    """
    Resolve the link destination and read the annotation rectangle.

    :param page: Page containing the link, used to compute bounding box.
    :param link: Raw link object.
    """
    self._page = page
    self._dest = pp.raw.FPDFLink_GetDest(page.pdf, link)

    rect = pp.raw.FS_RECTF()
    assert pp.raw.FPDFLink_GetAnnotRect(link, rect)
    source_box = Rectangle(rect)
    if page.rotation:
        # Swap the axes and mirror along the page height for rotated pages.
        source_box = Rectangle(
            source_box.p0.y,
            page.height - source_box.p1.x,
            source_box.p1.y,
            page.height - source_box.p0.x,
        )
    self.bbox: Rectangle = source_box
    """Bounding box of the link source"""
Parameters
- page: Page containing the link, used to compute bounding box.
- link: Raw link object.
class WebLink:
    """A weblink object giving the bounding box and destination URL."""

    def __init__(self, page: "modm_data.pdf.Page", index: int):  # noqa: F821
        """
        :param page: Page containing the link, used to compute bounding box.
        :param index: 0-index of the weblink object.
        """
        self._page = page
        self._link = page._linkpage
        self._index = index

    @cached_property
    def bbox_count(self) -> int:
        """The number of bounding boxes associated with this weblink."""
        return pp.raw.FPDFLink_CountRects(self._link, self._index)

    @cached_property
    def bboxes(self) -> list[Rectangle]:
        """
        The bounding boxes associated with this weblink.

        The boxes are remapped when the page reports a rotation.
        """
        bboxes = []
        for ii in range(self.bbox_count):
            x0, y0 = ctypes.c_double(), ctypes.c_double()
            x1, y1 = ctypes.c_double(), ctypes.c_double()
            # pdfium fills the values in the order left, top, right, bottom.
            assert pp.raw.FPDFLink_GetRect(self._link, self._index, ii, x0, y1, x1, y0)
            bboxes.append(Rectangle(x0.value, y0.value, x1.value, y1.value))
        if self._page.rotation:
            bboxes = [
                Rectangle(bbox.p0.y, self._page.height - bbox.p1.x, bbox.p1.y, self._page.height - bbox.p0.x)
                for bbox in bboxes
            ]
        return bboxes

    @cached_property
    def range(self) -> tuple[int, int]:
        """Start and end index of the characters associated with this link."""
        cstart = ctypes.c_int()
        ccount = ctypes.c_int()
        assert pp.raw.FPDFLink_GetTextRange(self._link, self._index, cstart, ccount)
        return (cstart.value, cstart.value + ccount.value)

    @cached_property
    def url(self) -> str:
        """
        The URL string of this link.

        The required buffer size is queried from pdfium first, so the URL is
        not limited to a fixed maximum length.
        """
        # With a zero-sized buffer pdfium only reports the number of 16-bit
        # code units needed, terminator included.
        needed = pp.raw.FPDFLink_GetURL(self._link, self._index, None, 0)
        if needed <= 0:
            return ""
        cbuffer = (ctypes.c_ushort * needed)()
        pp.raw.FPDFLink_GetURL(self._link, self._index, cbuffer, needed)
        return bytes(cbuffer).decode("utf-16-le").strip("\x00")

    def __repr__(self) -> str:
        return f"Url({self.url})"
A weblink object giving the bounding box and destination URL.
53 def __init__(self, page: "modm_data.pdf.Page", index: int): # noqa: F821 54 """ 55 :param page: Page containing the link, used to compute bounding box. 56 :param index: 0-index of the weblink object. 57 """ 58 self._page = page 59 self._link = page._linkpage 60 self._index = index
Parameters
- page: Page containing the link, used to compute bounding box.
- index: 0-index of the weblink object.
@cached_property
def bbox_count(self) -> int:
    """The number of bounding boxes associated with this weblink."""
    rect_count = pp.raw.FPDFLink_CountRects(self._link, self._index)
    return rect_count
The number of bounding boxes associated with this weblink.
@cached_property
def bboxes(self) -> list[Rectangle]:
    """
    The bounding boxes associated with this weblink.

    The boxes are remapped when the page reports a rotation.
    """
    rects = []
    for rect_index in range(self.bbox_count):
        left, top, right, bottom = (ctypes.c_double() for _ in range(4))
        # pdfium fills the values in the order left, top, right, bottom.
        assert pp.raw.FPDFLink_GetRect(self._link, self._index, rect_index, left, top, right, bottom)
        rects.append(Rectangle(left.value, bottom.value, right.value, top.value))
    if not self._page.rotation:
        return rects
    page = self._page
    return [Rectangle(r.p0.y, page.height - r.p1.x, r.p1.y, page.height - r.p0.x) for r in rects]
The bounding boxes associated with this weblink.
@cached_property
def range(self) -> tuple[int, int]:
    """
    Start and end index of the characters associated with this link.

    The end index is the start index plus the character count reported by
    pdfium.
    """
    first, length = ctypes.c_int(), ctypes.c_int()
    assert pp.raw.FPDFLink_GetTextRange(self._link, self._index, first, length)
    begin = first.value
    return (begin, begin + length.value)
Start and end index of the characters associated with this link.
@cached_property
def url(self) -> str:
    """
    The URL string of this link.

    The required buffer size is queried from pdfium first, so the URL is not
    limited to a fixed maximum length.
    """
    # With a zero-sized buffer pdfium only reports the number of 16-bit code
    # units needed, terminator included.
    needed = pp.raw.FPDFLink_GetURL(self._link, self._index, None, 0)
    if needed <= 0:
        return ""
    cbuffer = (ctypes.c_ushort * needed)()
    pp.raw.FPDFLink_GetURL(self._link, self._index, cbuffer, needed)
    return bytes(cbuffer).decode("utf-16-le").strip("\x00")
The URL string of this link.
class Path(pp.PdfObject):
    """
    This class specializes `pypdfium2.PdfObject` to add accessors for graphics
    containing vector paths of various configurations.

    You must construct the paths by calling `modm_data.pdf.page.Page.paths`.
    """

    class Type(Enum):
        """Path Type"""

        LINE = 0
        BEZIER = 1
        MOVE = 2

    class Cap(Enum):
        """Path Cap Type"""

        BUTT = 0
        ROUND = 1
        PROJECTING_SQUARE = 2

    class Join(Enum):
        """Path Join Type"""

        MITER = 0
        ROUND = 1
        BEVEL = 2

    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: PDF object of the path.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_PATH
        self.type = pp.raw.FPDF_PAGEOBJ_PATH

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        return self.get_matrix()

    @cached_property
    def count(self) -> int:
        """Number of segments in this path."""
        return pp.raw.FPDFPath_CountSegments(self)

    @cached_property
    def fill(self) -> int:
        """The fill color encoded as 32-bit RGBA."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        assert pp.raw.FPDFPageObj_GetFillColor(self, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def stroke(self) -> int:
        """The stroke color encoded as 32-bit RGBA."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        assert pp.raw.FPDFPageObj_GetStrokeColor(self, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def width(self) -> float:
        """The stroke width."""
        width = ctypes.c_float()
        assert pp.raw.FPDFPageObj_GetStrokeWidth(self, width)
        return width.value

    @cached_property
    def cap(self) -> Cap:
        """Line cap type."""
        return Path.Cap(pp.raw.FPDFPageObj_GetLineCap(self))

    @cached_property
    def join(self) -> Join:
        """Line join type."""
        return Path.Join(pp.raw.FPDFPageObj_GetLineJoin(self))

    @cached_property
    def bbox(self) -> Rectangle:
        """
        Bounding box of the path.

        .. warning::
            The bounding box is only approximated using the control points!
            Therefore bezier curves will likely have a larger bounding box.
        """
        left, bottom = ctypes.c_float(), ctypes.c_float()
        right, top = ctypes.c_float(), ctypes.c_float()
        assert pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top)
        bbox = Rectangle(left.value, bottom.value, right.value, top.value)
        if self.page.rotation:
            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
        return bbox

    @cached_property
    def points(self) -> list[Point]:
        """
        List of points of the path. If the path is closed, the first point is
        added to the end of the list.

        Every point is passed through the transformation matrix and, when the
        page reports a rotation, remapped to ``(y, page height - x)``.
        """
        points = []
        for ii in range(self.count):
            seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
            ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
            # The first point should always be MOVETO
            assert ii or ptype == Path.Type.MOVE

            cx, cy = ctypes.c_float(), ctypes.c_float()
            assert pp.raw.FPDFPathSegment_GetPoint(seg, cx, cy)
            x, y = self.matrix.on_point(cx.value, cy.value)
            points.append(Point(x, y, type=ptype))

            if pp.raw.FPDFPathSegment_GetClose(seg):
                points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))

        if self.page.rotation:
            # Use each point's own coordinates: the loop variables above only
            # hold the values of the last segment.
            points = [Point(p.y, self.page.height - p.x, type=p.type) for p in points]
        return points

    @cached_property
    def lines(self) -> list[Line]:
        """List of lines between the path points."""
        points = self.points
        return [
            Line(points[ii], points[ii + 1], width=self.width, type=points[ii + 1].type)
            for ii in range(len(points) - 1)
        ]

    def __repr__(self) -> str:
        points = ",".join(repr(p) for p in self.points)
        return f"P{self.count}={points}"
This class specializes `pypdfium2.PdfObject` to add accessors for graphics containing vector paths of various configurations.
You must construct the paths by calling `modm_data.pdf.page.Page.paths`.
def __init__(self, obj):
    """
    Re-wrap an existing page object as a path.

    :param obj: PDF object of the path.
    """
    super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
    path_type = pp.raw.FPDF_PAGEOBJ_PATH
    # Only page objects of the path kind are accepted.
    assert pp.raw.FPDFPageObj_GetType(obj.raw) == path_type
    self.type = path_type
Parameters
- obj: PDF object of the path.
@cached_property
def matrix(self) -> pp.PdfMatrix:
    """The transformation matrix, read once via ``get_matrix()``."""
    transform = self.get_matrix()
    return transform
The transformation matrix.
@cached_property
def count(self) -> int:
    """Number of segments in this path."""
    segment_count = pp.raw.FPDFPath_CountSegments(self)
    return segment_count
Number of segments in this path.
@cached_property
def fill(self) -> int:
    """The fill color encoded as 32-bit RGBA."""
    channels = [ctypes.c_uint() for _ in range(4)]
    assert pp.raw.FPDFPageObj_GetFillColor(self, *channels)
    red, green, blue, alpha = (channel.value for channel in channels)
    return (red << 24) | (green << 16) | (blue << 8) | alpha
The fill color encoded as 32-bit RGBA.
@cached_property
def stroke(self) -> int:
    """The stroke color encoded as 32-bit RGBA."""
    channels = [ctypes.c_uint() for _ in range(4)]
    assert pp.raw.FPDFPageObj_GetStrokeColor(self, *channels)
    red, green, blue, alpha = (channel.value for channel in channels)
    return (red << 24) | (green << 16) | (blue << 8) | alpha
The stroke color encoded as 32-bit RGBA.
@cached_property
def width(self) -> float:
    """The stroke width."""
    stroke_width = ctypes.c_float()
    assert pp.raw.FPDFPageObj_GetStrokeWidth(self, stroke_width)
    return stroke_width.value
The stroke width.
@cached_property
def cap(self) -> Cap:
    """Line cap type as a `Path.Cap` member."""
    raw_cap = pp.raw.FPDFPageObj_GetLineCap(self)
    return Path.Cap(raw_cap)
Line cap type.
@cached_property
def join(self) -> Join:
    """Line join type as a `Path.Join` member."""
    raw_join = pp.raw.FPDFPageObj_GetLineJoin(self)
    return Path.Join(raw_join)
Line join type.
@cached_property
def bbox(self) -> Rectangle:
    """
    Bounding box of the path.

    .. warning::
        The bounding box is only approximated using the control points!
        Therefore bezier curves will likely have a larger bounding box.
    """
    left, bottom, right, top = (ctypes.c_float() for _ in range(4))
    assert pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top)
    bounds = Rectangle(left.value, bottom.value, right.value, top.value)
    if not self.page.rotation:
        return bounds
    page = self.page
    # Swap the axes and mirror along the page height for rotated pages.
    return Rectangle(bounds.p0.y, page.height - bounds.p1.x, bounds.p1.y, page.height - bounds.p0.x)
Bounding box of the path.
The bounding box is only approximated using the control points! Therefore Bézier curves will likely have a larger bounding box.
@cached_property
def points(self) -> list[Point]:
    """
    List of points of the path. If the path is closed, the first point is
    added to the end of the list.

    Every point is passed through the transformation matrix and, when the
    page reports a rotation, remapped to ``(y, page height - x)``.
    """
    points = []
    for ii in range(self.count):
        seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
        ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
        # The first point should always be MOVETO
        assert ii or ptype == Path.Type.MOVE

        cx, cy = ctypes.c_float(), ctypes.c_float()
        assert pp.raw.FPDFPathSegment_GetPoint(seg, cx, cy)
        x, y = self.matrix.on_point(cx.value, cy.value)
        points.append(Point(x, y, type=ptype))

        if pp.raw.FPDFPathSegment_GetClose(seg):
            points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))

    if self.page.rotation:
        # Use each point's own coordinates: the loop variables above only
        # hold the values of the last segment.
        points = [Point(p.y, self.page.height - p.x, type=p.type) for p in points]
    return points
List of points of the path. If the path is closed, the first point is added to the end of the list.
@cached_property
def lines(self) -> list[Line]:
    """
    List of lines between the path points.

    Each line connects two consecutive points and takes its type from the
    point it ends on.
    """
    pts = self.points
    return [Line(start, end, width=self.width, type=end.type) for start, end in zip(pts, pts[1:])]
List of lines between the path points.
Inherited Members
- pypdfium2._helpers.pageobjects.PdfObject
- parent
- get_pos
- get_matrix
- set_matrix
- transform
- pypdfium2.internal.bases.AutoCloseable
- close
Path Type
Inherited Members
- enum.Enum
- name
- value
Path Cap Type
Inherited Members
- enum.Enum
- name
- value
Path Join Type
Inherited Members
- enum.Enum
- name
- value
class Image(pp.PdfImage):
    """
    This class extends `pypdfium2.PdfImage` to align it with the interface of
    the `Path` class so that it can be used in the same
    algorithms without filtering.

    You must construct the images by calling `modm_data.pdf.page.Page.images`.

    .. note:: Images are currently ignored.
    """

    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: Page object of the image.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        image_type = pp.raw.FPDF_PAGEOBJ_IMAGE
        # Only page objects of the image kind are accepted.
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == image_type
        self.type = image_type

        self.count: int = 4
        """Number of segments. Always 4 due to rectangular image form.
        (For compatibility with `Path.count`.)"""
        self.stroke: int = 0
        """The border stroke color. Always 0.
        (For compatibility with `Path.stroke`.)"""
        self.fill: int = 0
        """The image fill color. Always 0.
        (For compatibility with `Path.fill`.)"""
        self.width: float = 0
        """The border line width. Always 0.
        (For compatibility with `Path.width`.)"""

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        transform = self.get_matrix()
        return transform

    @cached_property
    def bbox(self) -> Rectangle:
        """The bounding box of the image."""
        bounds = Rectangle(*self.get_pos())
        if not self.page.rotation:
            return bounds
        page = self.page
        return Rectangle(bounds.p0.y, page.height - bounds.p1.x, bounds.p1.y, page.height - bounds.p0.x)

    @cached_property
    def points(self) -> list[Point]:
        """
        The 4 points of the bounding box.
        (For compatibility with `Path.points`.)
        """
        # NOTE(review): `bbox` already remaps its corners for rotated pages,
        # so the rotation looks applied twice here -- confirm this is intended.
        corners = self.bbox.points
        if not self.page.rotation:
            return corners
        return [Point(c.y, self.page.height - c.x, c.type) for c in corners]

    @cached_property
    def lines(self) -> list[Line]:
        """
        The 4 lines of the bounding box.
        (For compatibility with `Path.lines`.)
        """
        corners = self.points
        # Connect the corners in order and close the outline back to the first.
        return [Line(corners[ii], corners[(ii + 1) % 4], corners[(ii + 1) % 4].type, 0) for ii in range(4)]

    def __repr__(self) -> str:
        return f"I{self.bbox}"
This class extends `pypdfium2.PdfImage` to align it with the interface of the `Path` class so that it can be used in the same algorithms without filtering.
You must construct the images by calling `modm_data.pdf.page.Page.images`.
Note: Images are currently ignored.
def __init__(self, obj):
    """
    Re-wrap an existing page object as an image.

    :param obj: Page object of the image.
    """
    super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
    image_type = pp.raw.FPDF_PAGEOBJ_IMAGE
    # Only page objects of the image kind are accepted.
    assert pp.raw.FPDFPageObj_GetType(obj.raw) == image_type
    self.type = image_type

    self.count: int = 4
    """Number of segments. Always 4 due to rectangular image form.
    (For compatibility with `Path.count`.)"""
    self.stroke: int = 0
    """The border stroke color. Always 0.
    (For compatibility with `Path.stroke`.)"""
    self.fill: int = 0
    """The image fill color. Always 0.
    (For compatibility with `Path.fill`.)"""
    self.width: float = 0
    """The border line width. Always 0.
    (For compatibility with `Path.width`.)"""
Parameters
- obj: Page object of the image.
Number of segments. Always 4 due to rectangular image form.
(For compatibility with `Path.count`.)
@cached_property
def matrix(self) -> pp.PdfMatrix:
    """The transformation matrix, read once via ``get_matrix()``."""
    transform = self.get_matrix()
    return transform
The transformation matrix.
@cached_property
def bbox(self) -> Rectangle:
    """
    The bounding box of the image.

    The box is remapped when the page reports a rotation.
    """
    bounds = Rectangle(*self.get_pos())
    if not self.page.rotation:
        return bounds
    page = self.page
    # Swap the axes and mirror along the page height for rotated pages.
    return Rectangle(bounds.p0.y, page.height - bounds.p1.x, bounds.p1.y, page.height - bounds.p0.x)
The bounding box of the image.
@cached_property
def points(self) -> list[Point]:
    """
    The 4 points of the bounding box.
    (For compatibility with `Path.points`.)
    """
    # NOTE(review): `bbox` already remaps its corners for rotated pages, so
    # the rotation looks applied twice here -- confirm this is intended.
    corners = self.bbox.points
    if not self.page.rotation:
        return corners
    return [Point(c.y, self.page.height - c.x, c.type) for c in corners]
The 4 points of the bounding box.
(For compatibility with `Path.points`.)
@cached_property
def lines(self) -> list[Line]:
    """
    The 4 lines of the bounding box.
    (For compatibility with `Path.lines`.)
    """
    corners = self.points
    # Connect the corners in order and close the outline back to the first.
    return [Line(corners[ii], corners[(ii + 1) % 4], corners[(ii + 1) % 4].type, 0) for ii in range(4)]
The 4 lines of the bounding box.
(For compatibility with `Path.lines`.)
Inherited Members
- pypdfium2._helpers.pageobjects.PdfImage
- SIMPLE_FILTERS
- new
- get_metadata
- get_size
- load_jpeg
- set_bitmap
- get_bitmap
- get_data
- get_filters
- extract
- pypdfium2._helpers.pageobjects.PdfObject
- parent
- get_pos
- get_matrix
- set_matrix
- transform
- pypdfium2.internal.bases.AutoCloseable
- close
24class Structure: 25 """ 26 A PDF/UA ("tagged PDF") contains the structure of content as a tree data 27 structure with similar semantics to HTML. 28 29 This class is a convenience wrapper around [the pdfium structtree methods]( 30 https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h). 31 """ 32 33 def __init__(self, page: "modm_data.pdf.page.Page", element: pp.raw.FPDF_STRUCTELEMENT, parent: "Structure" = None): # noqa: F821 34 self._page = page 35 self._element = element 36 self.parent: Structure = weakref.ref(parent) if parent else None 37 """The parent node.""" 38 39 def _get_string(self, function) -> str: 40 length = function(self._element, 0, 0) 41 clength = ctypes.c_ulong(length) 42 cbuffer = ctypes.create_string_buffer(length) 43 function(self._element, cbuffer, clength) 44 return bytes(cbuffer).decode("utf-16-le", errors="ignore") 45 46 @cached_property 47 def title(self) -> str: 48 """Title `/T`""" 49 return self._get_string(pp.raw.FPDF_StructElement_GetTitle) 50 51 @cached_property 52 def actual_text(self) -> str: 53 """The actual text.""" 54 return self._get_string(pp.raw.FPDF_StructElement_GetActualText) 55 56 @cached_property 57 def alt_text(self) -> str: 58 """Alternate Text""" 59 return self._get_string(pp.raw.FPDF_StructElement_GetAltText) 60 61 @cached_property 62 def type(self) -> str: 63 """Type `/S`""" 64 return self._get_string(pp.raw.FPDF_StructElement_GetType) 65 66 @cached_property 67 def obj_type(self) -> str: 68 """Object Type `/Type`""" 69 return self._get_string(pp.raw.FPDF_StructElement_GetObjType) 70 71 @cached_property 72 def language(self) -> str: 73 """The case-insensitive IETF BCP 47 language code.""" 74 return self._get_string(pp.raw.FPDF_StructElement_GetLang) 75 76 @cached_property 77 def id(self) -> str: 78 """Identifier""" 79 return self._get_string(pp.raw.FPDF_StructElement_GetID) 80 81 @cached_property 82 def marked_ids(self) -> list[int]: 83 """List of marked content identifiers""" 84 ids = [] 85 for 
index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)): 86 if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1: 87 ids.append(mcid) 88 return ids 89 90 @cached_property 91 def attributes(self) -> dict[str, str | bool | float]: 92 """ 93 All attributes of this structure element as a dictionary. 94 95 .. note:: 96 Due to limitations of the pdfium API, attribute arrays cannot be 97 extracted! The values are marked as `[?]` in the dictionary. 98 """ 99 kv = {} 100 for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)): 101 attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex) 102 for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)): 103 # Get the name 104 clength = ctypes.c_ulong(0) 105 cname = ctypes.create_string_buffer(1) # workaround to get length 106 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength) 107 cname = ctypes.create_string_buffer(clength.value) 108 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength) 109 name = cname.raw.decode("utf-8", errors="ignore") 110 111 # Get the type 112 atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname) 113 assert atype != pp.raw.FPDF_OBJECT_UNKNOWN 114 115 # Then get each type individually 116 match atype: 117 case pp.raw.FPDF_OBJECT_BOOLEAN: 118 cbool = ctypes.bool() 119 assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool) 120 kv[name] = cbool.value 121 122 case pp.raw.FPDF_OBJECT_NUMBER: 123 cfloat = ctypes.c_float() 124 assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat) 125 kv[name] = cfloat.value 126 127 case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME: 128 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength) 129 cattrname = ctypes.create_string_buffer(clength.value * 2) 130 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, 
clength, clength) 131 kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[: clength.value - 1] 132 133 # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed? 134 # case pp.raw.FPDF_OBJECT_ARRAY: 135 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength) 136 # cblob = ctypes.create_string_buffer(clength.value) 137 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength) 138 # kv[name] = cblob.raw 139 140 case pp.raw.FPDF_OBJECT_ARRAY: 141 kv[name] = "[?]" 142 143 case _: 144 kv[name] = f"[unknown={atype}?]" 145 return kv 146 147 @cache 148 def child(self, index: int) -> "Structure": 149 """ 150 :param index: 0-index of child. 151 :return: Child structure. 152 """ 153 index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index) 154 return Structure(self._page, index, self) 155 156 @property 157 def children(self) -> list: 158 """All child structures.""" 159 count = pp.raw.FPDF_StructElement_CountChildren(self._element) 160 for ii in range(count): 161 yield self.child(ii) 162 163 def descr(self, indent=0) -> str: 164 """Description including all children via indentation.""" 165 string = " " * indent + repr(self) + "\n" 166 for child in self.children: 167 string += child.descr(indent + 4) 168 return string 169 170 def __repr__(self) -> str: 171 values = [] 172 if self.type: 173 values.append(f"type={self.type}") 174 if self.title: 175 values.append(f"title={self.title}") 176 if self.actual_text: 177 values.append(f"act_text={self.actual_text}") 178 if self.alt_text: 179 values.append(f"alt_text={self.alt_text}") 180 if self.id: 181 values.append(f"id={self.id}") 182 values += [f"mid={i}" for i in self.marked_ids] 183 values += [f"{k}={v}" for k, v in self.attributes.items()] 184 return f"S({','.join(map(str, values))})"
A PDF/UA ("tagged PDF") contains the structure of content as a tree data structure with similar semantics to HTML.
This class is a convenience wrapper around the pdfium structtree methods.
@cached_property
def title(self) -> str:
    """Title `/T`"""
    getter = pp.raw.FPDF_StructElement_GetTitle
    return self._get_string(getter)
Title /T
@cached_property
def actual_text(self) -> str:
    """The actual text."""
    getter = pp.raw.FPDF_StructElement_GetActualText
    return self._get_string(getter)
The actual text.
@cached_property
def alt_text(self) -> str:
    """Alternate Text"""
    getter = pp.raw.FPDF_StructElement_GetAltText
    return self._get_string(getter)
Alternate Text
@cached_property
def type(self) -> str:
    """Type `/S`"""
    getter = pp.raw.FPDF_StructElement_GetType
    return self._get_string(getter)
Type /S
@cached_property
def obj_type(self) -> str:
    """Object Type `/Type`"""
    getter = pp.raw.FPDF_StructElement_GetObjType
    return self._get_string(getter)
Object Type /Type
@cached_property
def language(self) -> str:
    """The case-insensitive IETF BCP 47 language code."""
    getter = pp.raw.FPDF_StructElement_GetLang
    return self._get_string(getter)
The case-insensitive IETF BCP 47 language code.
@cached_property
def id(self) -> str:
    """Identifier"""
    getter = pp.raw.FPDF_StructElement_GetID
    return self._get_string(getter)
Identifier
@cached_property
def marked_ids(self) -> list[int]:
    """
    List of marked content identifiers.

    Entries for which pdfium reports ``-1`` are left out.
    """
    element = self._element
    total = pp.raw.FPDF_StructElement_GetMarkedContentIdCount(element)
    candidates = (
        pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(element, position)
        for position in range(total)
    )
    return [mcid for mcid in candidates if mcid != -1]
List of marked content identifiers
90 @cached_property 91 def attributes(self) -> dict[str, str | bool | float]: 92 """ 93 All attributes of this structure element as a dictionary. 94 95 .. note:: 96 Due to limitations of the pdfium API, attribute arrays cannot be 97 extracted! The values are marked as `[?]` in the dictionary. 98 """ 99 kv = {} 100 for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)): 101 attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex) 102 for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)): 103 # Get the name 104 clength = ctypes.c_ulong(0) 105 cname = ctypes.create_string_buffer(1) # workaround to get length 106 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength) 107 cname = ctypes.create_string_buffer(clength.value) 108 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength) 109 name = cname.raw.decode("utf-8", errors="ignore") 110 111 # Get the type 112 atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname) 113 assert atype != pp.raw.FPDF_OBJECT_UNKNOWN 114 115 # Then get each type individually 116 match atype: 117 case pp.raw.FPDF_OBJECT_BOOLEAN: 118 cbool = ctypes.bool() 119 assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool) 120 kv[name] = cbool.value 121 122 case pp.raw.FPDF_OBJECT_NUMBER: 123 cfloat = ctypes.c_float() 124 assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat) 125 kv[name] = cfloat.value 126 127 case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME: 128 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength) 129 cattrname = ctypes.create_string_buffer(clength.value * 2) 130 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength) 131 kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[: clength.value - 1] 132 133 # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed? 
134 # case pp.raw.FPDF_OBJECT_ARRAY: 135 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength) 136 # cblob = ctypes.create_string_buffer(clength.value) 137 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength) 138 # kv[name] = cblob.raw 139 140 case pp.raw.FPDF_OBJECT_ARRAY: 141 kv[name] = "[?]" 142 143 case _: 144 kv[name] = f"[unknown={atype}?]" 145 return kv
All attributes of this structure element as a dictionary.
Due to limitations of the pdfium API, attribute arrays cannot be
extracted! The values are marked as [?]
in the dictionary.
@cache
def child(self, index: int) -> "Structure":
    """
    Fetch one direct child of this structure element.

    The lookup goes through the `cache` decorator, so repeated requests for
    the same index on the same structure are served from that cache.

    :param index: 0-index of child.
    :return: Child structure.
    """
    # Keep the raw element handle in its own name rather than rebinding
    # the `index` parameter.
    element = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
    return Structure(self._page, element, self)
Parameters
- index: 0-index of child.
Returns
Child structure.
@property
def children(self) -> list:
    """
    All child structures.

    Despite the `list` annotation this is a generator: the children are
    produced lazily, one `child()` lookup per index, in index order.
    """
    total = pp.raw.FPDF_StructElement_CountChildren(self._element)
    yield from map(self.child, range(total))
All child structures.
def descr(self, indent=0) -> str:
    """
    Description including all children via indentation.

    :param indent: Number of spaces placed in front of this element's line;
        every nesting level below it adds four more.
    :return: One `repr()` line per element, each terminated by a newline.
    """
    own_line = f"{' ' * indent}{self!r}\n"
    nested = "".join(kid.descr(indent + 4) for kid in self.children)
    return own_line + nested
Description including all children via indentation.
def render_page_pdf(doc, page, new_doc=None, index=0):
    """
    Copy a page into a pdfium document and draw debug overlays onto the copy.

    The overlays are, in drawing order: every path of the page re-traced
    with a thin semi-transparent blue stroke, a rectangle around each
    graphic cluster, a rectangle around each object link and each web link
    box, and for each character its bounding box plus small cross-hairs at
    the bounding box midpoint and at the character origin.

    :param doc: Document the page is imported from.
    :param page: Page providing `number`, `height`, `rotation`, `paths`,
        `graphic_clusters()`, `objlinks`, `weblinks` and `chars`.
    :param new_doc: Destination document; a new one is created when `None`.
    :param index: Index handed to `FPDF_ImportPages` and `FPDF_LoadPage`
        for the copied page inside the destination document.
    :return: The destination document containing the annotated page.
    :raises AssertionError: if a checked pdfium call reports failure.
    """

    def _require(ok, what):
        # The pdfium calls must not live inside an `assert` expression:
        # `python -O` strips asserts, so the page would never be imported
        # and nothing would be drawn. AssertionError is kept as the error
        # type so existing callers see the same exception as before.
        if not ok:
            raise AssertionError(f"pdfium call failed: {what}")

    raw = pp.raw
    height = page.height
    rotation = page.rotation

    def _place(point):
        # Any non-zero rotation maps (x, y) to (height - y, x); unrotated
        # pages keep their coordinates.
        if rotation:
            return height - point.y, point.x
        return point.x, point.y

    if new_doc is None:
        new_doc = raw.FPDF_CreateNewDocument()
    # copy page over to new doc
    _require(
        raw.FPDF_ImportPages(new_doc, doc, str(page.number).encode("ascii"), index),
        "FPDF_ImportPages",
    )
    new_page = raw.FPDF_LoadPage(new_doc, index)

    for path in page.paths:
        # The first point only seeds the new path object; the remaining
        # points are appended as move-to or line-to segments.
        obj = raw.FPDFPageObj_CreateNewPath(*_place(path.points[0]))
        _require(raw.FPDFPageObj_SetStrokeColor(obj, 0, 0, 0xFF, 0xC0), "FPDFPageObj_SetStrokeColor")
        _require(raw.FPDFPageObj_SetStrokeWidth(obj, 0.25), "FPDFPageObj_SetStrokeWidth")
        _require(raw.FPDFPageObj_SetLineJoin(obj, raw.FPDF_LINEJOIN_ROUND), "FPDFPageObj_SetLineJoin")
        _require(raw.FPDFPageObj_SetLineCap(obj, raw.FPDF_LINECAP_ROUND), "FPDFPageObj_SetLineCap")
        # Fill mode 0 with stroke enabled: outline only.
        _require(raw.FPDFPath_SetDrawMode(obj, 0, True), "FPDFPath_SetDrawMode")
        for point in path.points[1:]:
            if point.type == path.Type.MOVE:
                _require(raw.FPDFPath_MoveTo(obj, *_place(point)), "FPDFPath_MoveTo")
            else:
                _require(raw.FPDFPath_LineTo(obj, *_place(point)), "FPDFPath_LineTo")
        raw.FPDFPage_InsertObject(new_page, obj)

    for bbox, _ in page.graphic_clusters():
        _rect(new_page, rotation, bbox, width=2, stroke=0x00FFFF)

    for link in page.objlinks:
        _rect(new_page, rotation, link.bbox, width=0.75, stroke=0x9ACD32)

    for link in page.weblinks:
        for bbox in link.bboxes:
            _rect(new_page, rotation, bbox, width=0.75, stroke=0x00FF00)

    for char in page.chars:
        # The origin cross-hair uses 0x0000FF for characters whose bounding
        # box has no width and 0x000000 for all others.
        color = 0x0000FF
        if char.bbox.width:
            mid = char.bbox.midpoint
            _rect(new_page, rotation, char.bbox, width=0.5, stroke=0xFF0000)
            _vline(new_page, rotation, mid.x, mid.y - 1, mid.y + 1, width=0.25, stroke=0xFF0000)
            _hline(new_page, rotation, mid.y, mid.x - 1, mid.x + 1, width=0.25, stroke=0xFF0000)
            color = 0x000000
        _vline(new_page, rotation, char.origin.x, char.origin.y - 1, char.origin.y + 1, width=0.25, stroke=color)
        _hline(new_page, rotation, char.origin.y, char.origin.x - 1, char.origin.x + 1, width=0.25, stroke=color)

    # Commit the inserted objects to the page content stream before closing.
    _require(raw.FPDFPage_GenerateContent(new_page), "FPDFPage_GenerateContent")
    raw.FPDF_ClosePage(new_page)
    return new_doc