modm_data.pdf
PDF Content Accessors
This module extends the pypdfium2 Python API with low-level accessors for characters and graphics. Note that these modules support read-only access to PDFs, since a lot of caching is used to speed up commonly accessed properties.
This module only contains formatting independent PDF access which is then
specialized in the vendor-specific modm_data.pdf2html
modules.
1# Copyright 2022, Niklas Hauser 2# SPDX-License-Identifier: MPL-2.0 3 4""" 5# PDF Content Accessors 6 7This module extends the pypdfium2 Python API with low-level accessors for 8characters and graphics. Note that these modules support read-only access to 9PDFs, since a lot of caching is used to speed up commonly accessed properties. 10 11This module only contains formatting independent PDF access which is then 12specialized in the vendor-specific `modm_data.pdf2html` modules. 13""" 14 15from .document import Document 16from .page import Page 17from .character import Character 18from .link import ObjLink, WebLink 19from .path import Path 20from .image import Image 21from .render import annotate_debug_info 22from .structure import Structure 23 24__all__ = [ 25 "annotate_debug_info", 26 "Document", 27 "Page", 28 "Character", 29 "Path", 30 "Image", 31 "ObjLink", 32 "WebLink", 33 "Structure", 34]
def annotate_debug_info(page: Page, new_doc: pp.PdfDocument = None, index: int = 0) -> pp.PdfDocument:
    """
    Copies the page into a new or existing PDF document and overlays the
    internal information on top of the content.
    - Renders the bounding boxes in RED and origins in BLACK of all characters.
      Characters whose bounding box has no width only get their origin drawn,
      using the stroke value 0x0000FF instead.
    - Renders the bounding boxes of web links in BLUE GREEN.
    - Renders the bounding boxes of object links in YELLOW GREEN.
    - Renders all graphics paths in BLUE.
    - Renders the bounding boxes of computed graphics clusters in CYAN.

    :param page: The page to be annotated.
    :param new_doc: The PDF document to copy the page to. If not provided, a new document is created.
    :param index: The index of the page in the new document.
    :return: The new document with the annotated page added.
    :raises AssertionError: if one of the checked PDFium calls reports failure.
    """

    def _require(result, what: str):
        # The PDFium calls below have side effects, so they must not live inside
        # an `assert` statement: `python -O` strips asserts and would silently
        # skip importing the page and drawing the paths. The exception type is
        # kept as AssertionError for callers that relied on the old behaviour.
        if not result:
            raise AssertionError(f"{what} failed")
        return result

    height = page.height
    rotation = page.rotation

    def _xy(point) -> tuple:
        # On rotated pages the overlay coordinates are swapped and mirrored
        # along the page height, exactly as the original per-call branches did.
        if rotation:
            return height - point.y, point.x
        return point.x, point.y

    if new_doc is None:
        new_doc = pp.raw.FPDF_CreateNewDocument()
    # copy page over to new doc
    _require(
        pp.raw.FPDF_ImportPages(new_doc, page.pdf, str(page.number).encode("ascii"), index),
        "FPDF_ImportPages",
    )
    new_page = pp.raw.FPDF_LoadPage(new_doc, index)

    for path in page.paths:
        points = path.points
        if not points:
            # Nothing to draw, and points[0] would raise IndexError.
            continue
        obj = pp.raw.FPDFPageObj_CreateNewPath(*_xy(points[0]))
        _require(pp.raw.FPDFPageObj_SetStrokeColor(obj, 0, 0, 0xFF, 0xC0), "FPDFPageObj_SetStrokeColor")
        _require(pp.raw.FPDFPageObj_SetStrokeWidth(obj, 0.25), "FPDFPageObj_SetStrokeWidth")
        _require(pp.raw.FPDFPageObj_SetLineJoin(obj, pp.raw.FPDF_LINEJOIN_ROUND), "FPDFPageObj_SetLineJoin")
        _require(pp.raw.FPDFPageObj_SetLineCap(obj, pp.raw.FPDF_LINECAP_ROUND), "FPDFPageObj_SetLineCap")
        _require(pp.raw.FPDFPath_SetDrawMode(obj, 0, True), "FPDFPath_SetDrawMode")
        for point in points[1:]:
            if point.type == path.Type.MOVE:
                _require(pp.raw.FPDFPath_MoveTo(obj, *_xy(point)), "FPDFPath_MoveTo")
            else:
                _require(pp.raw.FPDFPath_LineTo(obj, *_xy(point)), "FPDFPath_LineTo")
        pp.raw.FPDFPage_InsertObject(new_page, obj)

    for bbox, _ in page.graphic_clusters():
        _rect(new_page, rotation, bbox, width=2, stroke=0x00FFFF)

    for link in page.objlinks:
        _rect(new_page, rotation, link.bbox, width=0.75, stroke=0x9ACD32)

    for link in page.weblinks:
        for bbox in link.bboxes:
            _rect(new_page, rotation, bbox, width=0.75, stroke=0x00FF00)

    for char in page.chars:
        # Origin colour used when the character has no usable bounding box.
        color = 0x0000FF
        if char.bbox.width:
            _rect(new_page, rotation, char.bbox, width=0.5, stroke=0xFF0000)
            # Small cross at the centre of the bounding box.
            _vline(
                new_page,
                rotation,
                char.bbox.midpoint.x,
                char.bbox.midpoint.y - 1,
                char.bbox.midpoint.y + 1,
                width=0.25,
                stroke=0xFF0000,
            )
            _hline(
                new_page,
                rotation,
                char.bbox.midpoint.y,
                char.bbox.midpoint.x - 1,
                char.bbox.midpoint.x + 1,
                width=0.25,
                stroke=0xFF0000,
            )
            color = 0x000000
        # Small cross at the character origin.
        _vline(new_page, rotation, char.origin.x, char.origin.y - 1, char.origin.y + 1, width=0.25, stroke=color)
        _hline(new_page, rotation, char.origin.y, char.origin.x - 1, char.origin.x + 1, width=0.25, stroke=color)

    _require(pp.raw.FPDFPage_GenerateContent(new_page), "FPDFPage_GenerateContent")
    pp.raw.FPDF_ClosePage(new_page)
    return new_doc
Copies the page into a new or existing PDF document and overlays the internal information on top of the content.
- Renders the bounding boxes in RED and origins in BLACK of all characters.
- Renders the bounding boxes of web links in BLUE GREEN.
- Renders the bounding boxes of object links in YELLOW GREEN.
- Renders all graphics paths in BLUE.
- Renders the bounding boxes of computed graphics clusters in CYAN.
Parameters
- page: The page to be annotated.
- new_doc: The PDF document to copy the page to. If not provided, a new document is created.
- index: The index of the page in the new document.
Returns
The new document with the annotated page added.
class Document(pp.PdfDocument):
    """
    The PDF document is the root of the entire data structure and provides
    access to PDF metadata, the table of contents, as well as individual
    pages.

    You should extend from this class for a specific vendor to provide the
    correct page class from `page()` function.

    This class is a convenience wrapper with caching around the high-level APIs
    of pypdfium.
    """

    def __init__(self, path: Path, autoclose: bool = False):
        """
        :param path: Path to the PDF to open.
        :param autoclose: Forwarded unchanged to the pypdfium document constructor.
        """
        path = Path(path)
        self.name: str = path.stem
        """Stem of the document file name"""
        super().__init__(path, autoclose=autoclose)
        self._path = path
        # Shared between all pages of this document, filled by the pages while
        # they repair missing character bounding boxes.
        self._bbox_cache = defaultdict(dict)
        _LOGGER.debug(f"Loading: {path}")

    @cached_property
    def metadata(self) -> dict[str, str]:
        """The PDF metadata dictionary."""
        return self.get_metadata_dict()

    @property
    def destinations(self) -> Iterator[tuple[int, str]]:
        """Yields (page 0-index, named destination) of the whole document."""
        for ii in range(pp.raw.FPDF_CountNamedDests(self)):
            # NOTE(review): the result of the buffer-less call is used as the
            # name length; confirm against the PDFium FPDF_GetNamedDest contract.
            length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
            clength = ctypes.c_long(length)
            cbuffer = ctypes.create_string_buffer(length * 2)
            dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
            name = cbuffer.raw[: clength.value * 2].decode("utf-16-le").rstrip("\x00")
            page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
            yield (page, name)

    @cached_property
    def toc(self) -> list[pp.PdfOutlineItem]:
        """
        The table of content as a sorted list of outline items ensuring every
        item has a page index by reusing the last one.
        """
        # Sometimes the TOC contains duplicates so we must use a set
        outlines = set()
        last_page_index = 0
        for item in self.get_toc():
            # Only a *missing* index falls back to the previous one. Testing
            # with `or` would also discard the valid index 0 (first page).
            if item.page_index is not None:
                last_page_index = item.page_index
            outlines.add(
                _OutlineItem(
                    item.level,
                    item.title,
                    item.is_closed,
                    item.n_kids,
                    last_page_index,
                    item.view_mode,
                    item.view_pos,
                )
            )
        return sorted(outlines, key=lambda o: (o.page_index, o.level, o.title))

    @cached_property
    def identifier_permanent(self) -> str:
        """The permanent file identifier."""
        return self.get_identifier(pp.raw.FILEIDTYPE_PERMANENT)

    @cached_property
    def identifier_changing(self) -> str:
        """The changing file identifier."""
        return self.get_identifier(pp.raw.FILEIDTYPE_CHANGING)

    @cached_property
    def page_count(self) -> int:
        """The number of pages in the document."""
        return pp.raw.FPDF_GetPageCount(self)

    # NOTE(review): `functools.cache` on a method keys on `self` and therefore
    # keeps every document (and its pages) referenced by the cache.
    @cache
    def page(self, index: int) -> Page:
        """
        :param index: 0-indexed page number.
        :return: the page object for the index.
        """
        # Negative indices are rejected too, they are not valid page numbers.
        assert 0 <= index < self.page_count, f"page index {index} out of range"
        return Page(self, index)

    def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
        """
        :param numbers: an iterable range of page numbers (0-indexed!).
                        If `None`, then the whole page range is used.
        :return: yields each page in the range, silently skipping numbers
                 outside of the document.
        """
        if numbers is None:
            numbers = range(self.page_count)
        for ii in numbers:
            if 0 <= ii < self.page_count:
                yield self.page(ii)

    def __repr__(self) -> str:
        return f"Doc({self.name})"
The PDF document is the root of the entire data structure and provides access to PDF metadata, the table of contents, as well as individual pages.
You should extend from this class for a specific vendor to provide the
correct page class from page()
function.
This class is a convenience wrapper with caching around the high-level APIs of pypdfium.
def __init__(self, path: Path, autoclose: bool = False):
    """
    Opens the PDF and sets up the per-document caches.

    :param path: Path to the PDF to open.
    :param autoclose: Passed through to the parent constructor.
    """
    path = Path(path)
    self.name: str = path.stem
    """Stem of the document file name"""
    super().__init__(path, autoclose=autoclose)
    # bounding box cache shared by the pages of this document
    self._bbox_cache = defaultdict(dict)
    self._path = path
    _LOGGER.debug(f"Loading: {path}")
Parameters
- path: Path to the PDF to open.
@cached_property
def metadata(self) -> dict[str, str]:
    """The PDF metadata dictionary, read once and cached afterwards."""
    info = self.get_metadata_dict()
    return info
The PDF metadata dictionary.
@property
def destinations(self) -> Iterator[tuple[int, str]]:
    """
    Yields (page 0-index, named destination) of the whole document.

    The name is decoded from a UTF-16-LE buffer with trailing NUL characters
    stripped.
    """
    for ii in range(pp.raw.FPDF_CountNamedDests(self)):
        # NOTE(review): this call passes 0 for both the buffer and the length
        # argument and treats the return value as the name length; confirm
        # against the PDFium FPDF_GetNamedDest documentation.
        length = pp.raw.FPDF_GetNamedDest(self, ii, 0, 0)
        clength = ctypes.c_long(length)
        # buffer is allocated with twice the queried length
        cbuffer = ctypes.create_string_buffer(length * 2)
        dest = pp.raw.FPDF_GetNamedDest(self, ii, cbuffer, clength)
        name = cbuffer.raw[: clength.value * 2].decode("utf-16-le").rstrip("\x00")
        page = pp.raw.FPDFDest_GetDestPageIndex(self, dest)
        yield (page, name)
Yields (page 0-index, named destination) of the whole document.
@cached_property
def toc(self) -> list[pp.PdfOutlineItem]:
    """
    The table of content as a sorted list of outline items ensuring every
    item has a page index by reusing the last one.

    Items are de-duplicated and ordered by (page index, level, title).
    """
    # Sometimes the TOC contains duplicates so we must use a set
    outlines = set()
    last_page_index = 0
    for item in self.get_toc():
        # Only a *missing* index falls back to the previous one. Testing with
        # `or` would also discard the valid index 0 (first page).
        if item.page_index is not None:
            last_page_index = item.page_index
        outlines.add(
            _OutlineItem(
                item.level,
                item.title,
                item.is_closed,
                item.n_kids,
                last_page_index,
                item.view_mode,
                item.view_pos,
            )
        )
    return sorted(outlines, key=lambda o: (o.page_index, o.level, o.title))
The table of content as a sorted list of outline items, ensuring every item has a page index by reusing the last one.
@cached_property
def identifier_permanent(self) -> str:
    """The permanent file identifier of the document."""
    kind = pp.raw.FILEIDTYPE_PERMANENT
    return self.get_identifier(kind)
The permanent file identifier.
@cached_property
def identifier_changing(self) -> str:
    """The changing file identifier of the document."""
    kind = pp.raw.FILEIDTYPE_CHANGING
    return self.get_identifier(kind)
The changing file identifier.
@cached_property
def page_count(self) -> int:
    """The number of pages in the document, queried once and cached."""
    count = pp.raw.FPDF_GetPageCount(self)
    return count
The number of pages in the document.
# NOTE(review): `functools.cache` on a method keys on `self` and therefore
# keeps every document (and its pages) referenced by the cache.
@cache
def page(self, index: int) -> Page:
    """
    :param index: 0-indexed page number.
    :return: the page object for the index.
    :raises AssertionError: if the index is negative or not below `page_count`.
    """
    # Negative indices are rejected too, they are not valid page numbers.
    assert 0 <= index < self.page_count, f"page index {index} out of range"
    return Page(self, index)
Parameters
- index: 0-indexed page number.
Returns
the page object for the index.
def pages(self, numbers: Iterable[int] = None) -> Iterator[Page]:
    """
    Iterates over a selection of pages.

    :param numbers: an iterable range of page numbers (0-indexed!).
                    If `None`, then the whole page range is used.
    :return: yields each page in the range; numbers outside of the document
             are skipped.
    """
    wanted = range(self.page_count) if numbers is None else numbers
    for number in wanted:
        if not 0 <= number < self.page_count:
            continue
        yield self.page(number)
Parameters
- numbers: an iterable range of page numbers (0-indexed!).
If
None
, then the whole page range is used.
Returns
yields each page in the range.
Inherited Members
- pypdfium2._helpers.document.PdfDocument
- formenv
- parent
- new
- init_forms
- get_formtype
- get_pagemode
- is_tagged
- save
- get_identifier
- get_version
- get_metadata_value
- METADATA_KEYS
- get_metadata_dict
- count_attachments
- get_attachment
- new_attachment
- del_attachment
- get_page
- new_page
- del_page
- import_pages
- get_page_size
- get_page_label
- page_as_xobject
- get_toc
- render
- pypdfium2.internal.bases.AutoCloseable
- close
class Page(pp.PdfPage):
    """
    This class provides low-level access to graphics and characters of the page.
    It also fixes missing bounding boxes for rotated characters on page load,
    as well as allows searching for characters in an area instead of just text.
    """

    def __init__(self, document: "modm_data.pdf.Document", index: int):  # noqa: F821
        """
        :param document: a PDF document.
        :param index: 0-index page number.
        """
        self.index = index
        """0-index page number."""
        self.number = index + 1
        """1-index page number."""

        super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
        self._links = None
        self._weblinks = None
        self._linked = False

        _LOGGER.debug(f"Loading: {index}")

        self._text = self.get_textpage()
        self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
        self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
        # close them in reverse order
        weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
        weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)

        self._fix_bboxes()

    @cached_property
    def label(self) -> str:
        """The page label."""
        return self.pdf.get_page_label(self.index)

    @cached_property
    def width(self) -> float:
        """The page width."""
        return self.get_width()

    @cached_property
    def height(self) -> float:
        """The page height."""
        return self.get_height()

    @cached_property
    def rotation(self) -> int:
        """The page rotation in degrees."""
        return self.get_rotation()

    @cached_property
    def bbox(self) -> Rectangle:
        """The page bounding box."""
        return Rectangle(*self.get_bbox())

    @cached_property
    def char_count(self) -> int:
        """The total count of characters."""
        return self._text.count_chars()

    @cache
    def char(self, index: int) -> Character:
        """:return: The character at the 0-index."""
        return Character(self, index)

    @property
    def chars(self) -> Iterator[Character]:
        """Yields all characters."""
        for ii in range(self.char_count):
            yield self.char(ii)

    @cached_property
    def objlinks(self) -> list[ObjLink]:
        """All object links."""
        links = []
        pos = ctypes.c_int(0)
        while True:
            # A fresh handle per link: the enumerate call writes into the handle
            # it is given, so reusing one object would make every wrapper that
            # keeps its handle point at the last link.
            link = pp.raw.FPDF_LINK()
            if not pp.raw.FPDFLink_Enumerate(self, pos, link):
                break
            links.append(ObjLink(self, link))
        return links

    @cached_property
    def weblinks(self) -> list[WebLink]:
        """All web links."""
        return [WebLink(self, ii) for ii in range(pp.raw.FPDFLink_CountWebLinks(self._linkpage))]

    def chars_in_area(self, area: Rectangle) -> list[Character]:
        """
        :param area: area to search for character in.
        :return: All characters whose bounding box midpoint lies in the area.
        """
        found = []
        # We perform binary searches of the lower and upper y-positions first
        # lines are ordered by y-position
        ypositions = list(self._charlines.keys())
        y_bottom = bisect_left(ypositions, area.bottom)
        y_top = bisect_right(ypositions, area.top, lo=y_bottom)

        # Then for every line we do another binary search for left and right
        for ypos in ypositions[y_bottom:y_top]:
            chars = self._charlines[ypos]
            x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x)
            x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x)
            # Finally we add all these characters
            found.extend(chars[x_left:x_right])
        return found

    def text_in_area(self, area: Rectangle) -> str:
        """
        :param area: area to search for text in.
        :return: Only the text found in the area.
        """
        return self._text.get_text_bounded(area.left, area.bottom, area.right, area.top)

    @property
    def structures(self) -> Iterator[Structure]:
        """The PDF/UA tags."""
        count = pp.raw.FPDF_StructTree_CountChildren(self._structtree)
        for ii in range(count):
            child = pp.raw.FPDF_StructTree_GetChildAtIndex(self._structtree, ii)
            yield Structure(self, child)

    def find(self, string: str, case_sensitive: bool = True) -> Iterator[list[Character]]:
        """
        Searches for a match string as whole, consecutive words and yields the
        characters.

        :param string: The search string.
        :param case_sensitive: Ignore case if false.
        :return: yields one list of characters per match.
        """
        searcher = self._text.search(string, match_case=case_sensitive, match_whole_word=True, consecutive=True)
        # each hit is used as (index of first character, character count)
        while idx := searcher.get_next():
            yield [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]

    @cached_property
    def paths(self) -> list[Path]:
        """All paths."""
        return [Path(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])]

    @cached_property
    def images(self) -> list[Image]:
        """All images."""
        return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]

    def graphic_clusters(
        self, predicate: Callable[[Path | Image], bool] = None, absolute_tolerance: float = None
    ) -> list[tuple[Rectangle, list[Path | Image]]]:
        """
        Groups the paths and images of the page into clusters.

        The objects are first merged into vertical regions by their
        bottom/top extent, then every vertical region is split into horizontal
        sub-regions by the left/right extent of the objects it contains.

        :param predicate: Only objects for which this returns true are
                          clustered. If `None`, all paths and images are used.
        :param absolute_tolerance: Tolerance passed to the region overlap and
                                   containment checks. If `None`, 1% of the
                                   smaller page dimension is used.
        :return: A list of (bounding box, objects) tuples sorted by descending
                 `bbox.y` and then ascending `bbox.x`.
        """
        if absolute_tolerance is None:
            absolute_tolerance = min(self.width, self.height) * 0.01

        # First collect all vertical regions
        filtered_paths = []
        for path in self.paths:
            if predicate is None or predicate(path):
                filtered_paths.append(path)
        for image in self.images:
            if predicate is None or predicate(image):
                filtered_paths.append(image)

        regions = []
        for path in sorted(filtered_paths, key=lambda path: path.bbox.y):
            for reg in regions:
                if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
                    # They overlap, so merge them
                    reg.v0 = min(reg.v0, path.bbox.bottom)
                    reg.v1 = max(reg.v1, path.bbox.top)
                    reg.objs.append(path)
                    break
            else:
                regions.append(Region(path.bbox.bottom, path.bbox.top, path))

        # Now collect horizontal region inside each vertical region.
        # The x-ordering does not depend on the region, so sort only once.
        paths_by_x = sorted(filtered_paths, key=lambda path: path.bbox.x)
        for yreg in regions:
            for path in paths_by_x:
                # check if horizontal line is contained in vregion
                if yreg.contains(path.bbox.y, absolute_tolerance):
                    for xreg in yreg.subregions:
                        if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
                            # They overlap so merge them
                            xreg.v0 = min(xreg.v0, path.bbox.left)
                            xreg.v1 = max(xreg.v1, path.bbox.right)
                            xreg.objs.append(path)
                            break
                    else:
                        yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))

        clusters = []
        for yreg in regions:
            for xreg in yreg.subregions:
                if len(yreg.subregions) > 1:
                    # Strip down the height again for subregions. Use the real
                    # extremes of the objects: fixed start values (1e9 and 0)
                    # clamp the result for negative or very large coordinates.
                    y0 = min((obj.bbox.bottom for obj in xreg.objs), default=yreg.v0)
                    y1 = max((obj.bbox.top for obj in xreg.objs), default=yreg.v1)
                else:
                    y0, y1 = yreg.v0, yreg.v1
                bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
                clusters.append((bbox, xreg.objs))

        return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))

    def _link_characters(self):
        """Assigns the object and web links of the page to their characters (once)."""
        if self._linked:
            return
        # The in-document links only gives us rectangles and we must find the
        # linked chars ourselves
        for link in self.objlinks:
            for char in self.chars_in_area(link.bbox):
                char.objlink = link
        # The weblinks give you an explicit char range, very convenient
        for link in self.weblinks:
            for ii in range(*link.range):
                self.char(ii).weblink = link
        self._linked = True

    @cached_property
    def _charlines(self):
        """Characters grouped by rounded midpoint y, lines and characters sorted ascending."""
        charlines = defaultdict(list)
        for char in self.chars:
            charlines[round(char.bbox.midpoint.y, 1)].append(char)

        orderedchars = OrderedDict.fromkeys(sorted(charlines))
        for ypos, chars in charlines.items():
            orderedchars[ypos] = sorted(chars, key=lambda c: c.bbox.midpoint.x)

        return orderedchars

    def _fix_bboxes(self):
        """
        Replaces empty loose bounding boxes with the box of a matching
        character (same font, unicode and rounded tight box size) that was
        remembered in the document-wide cache.
        """

        def _key(char):
            height = round(char.tbbox.height, 1)
            width = round(char.tbbox.width, 1)
            return f"{char.font} {char.unicode} {height} {width}"

        fix_chars = []
        for char in self.chars:
            if not char._bbox.width or not char._bbox.height:
                # line feed / carriage return only need fixing when rotated
                if char._rotation or char.unicode not in {0xA, 0xD}:
                    fix_chars.append(char)
            elif char.unicode not in {0xA, 0xD} and not char._rotation:
                key = _key(char)
                if key not in self.pdf._bbox_cache:
                    # remember the box relative to the origin
                    bbox = char._bbox.translated(-char.origin).rotated(self.rotation + char._rotation)
                    self.pdf._bbox_cache[key] = (char, bbox)
        for char in fix_chars:
            cached = self.pdf._bbox_cache.get(_key(char))
            if cached is not None:
                _, bbox = cached
                char._bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
            elif char.unicode not in {0x20, 0xA, 0xD}:
                _LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")
This class provides low-level access to graphics and characters of the page. It also fixes missing bounding boxes for rotated characters on page load, and allows searching for characters in an area instead of just text.
def __init__(self, document: "modm_data.pdf.Document", index: int):  # noqa: F821
    """
    Loads the page with the given index from the document, opens its text
    page, web link page and structure tree, and finally repairs missing
    character bounding boxes.

    :param document: a PDF document.
    :param index: 0-index page number.
    """
    self.index = index
    """0-index page number."""
    self.number = index + 1
    """1-index page number."""

    super().__init__(pp.raw.FPDF_LoadPage(document, index), document, document.formenv)
    self._links = None
    self._weblinks = None
    # set once `_link_characters()` has assigned the links to the characters
    self._linked = False

    _LOGGER.debug(f"Loading: {index}")

    self._text = self.get_textpage()
    self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
    self._structtree = pp.raw.FPDF_StructTree_GetForPage(self)
    # close them in reverse order
    # NOTE(review): presumably relies on the order in which weakref finalizers
    # run; confirm before reordering these two registrations.
    weakref.finalize(self, pp.raw.FPDF_StructTree_Close, self._structtree)
    weakref.finalize(self, pp.raw.FPDFLink_CloseWebLinks, self._linkpage)

    self._fix_bboxes()
Parameters
- document: a PDF document.
- index: 0-index page number.
@cached_property
def label(self) -> str:
    """The page label as reported by the owning document."""
    document = self.pdf
    return document.get_page_label(self.index)
The page label.
@cached_property
def width(self) -> float:
    """The page width, queried once and cached."""
    page_width = self.get_width()
    return page_width
The page width.
@cached_property
def height(self) -> float:
    """The page height, queried once and cached."""
    page_height = self.get_height()
    return page_height
The page height.
@cached_property
def rotation(self) -> int:
    """The page rotation in degrees, queried once and cached."""
    degrees = self.get_rotation()
    return degrees
The page rotation in degrees.
@cached_property
def bbox(self) -> Rectangle:
    """The page bounding box as a rectangle."""
    coordinates = self.get_bbox()
    return Rectangle(*coordinates)
The page bounding box.
@cached_property
def char_count(self) -> int:
    """The total count of characters on the text page."""
    textpage = self._text
    return textpage.count_chars()
The total count of characters.
@cache
def char(self, index: int) -> Character:
    """
    :param index: 0-index of the character on this page.
    :return: The character at the 0-index.
    """
    character = Character(self, index)
    return character
Returns
The character at the 0-index.
@property
def chars(self) -> Iterator[Character]:
    """Yields all characters of the page in index order."""
    yield from (self.char(position) for position in range(self.char_count))
Yields all characters.
@cached_property
def objlinks(self) -> list[ObjLink]:
    """All object links of the page, in enumeration order."""
    links = []
    pos = ctypes.c_int(0)
    while True:
        # A fresh handle per link: the enumerate call writes into the handle it
        # is given, so reusing one object would make every wrapper that keeps
        # its handle point at the last link.
        link = pp.raw.FPDF_LINK()
        if not pp.raw.FPDFLink_Enumerate(self, pos, link):
            break
        links.append(ObjLink(self, link))
    return links
All object links.
@cached_property
def weblinks(self) -> list[WebLink]:
    """All web links of the page, in index order."""
    total = pp.raw.FPDFLink_CountWebLinks(self._linkpage)
    return [WebLink(self, position) for position in range(total)]
All web links.
116 def chars_in_area(self, area: Rectangle) -> list[Character]: 117 """ 118 :param area: area to search for character in. 119 :return: All characters found in the area. 120 """ 121 found = [] 122 # We perform binary searches of the lower and upper y-positions first 123 # lines are ordered by y-position 124 ypositions = list(self._charlines.keys()) 125 y_bottom = bisect_left(ypositions, area.bottom) 126 y_top = bisect_right(ypositions, area.top, lo=y_bottom) 127 128 # Then for every line we do another binary search for left and right 129 for ypos in ypositions[y_bottom:y_top]: 130 chars = self._charlines[ypos] 131 x_left = bisect_left(chars, area.left, key=lambda c: c.bbox.midpoint.x) 132 x_right = bisect_right(chars, area.right, lo=x_left, key=lambda c: c.bbox.midpoint.x) 133 # Finally we add all these characters 134 found.extend(chars[x_left:x_right]) 135 return found
Parameters
- area: area to search for character in.
Returns
All characters found in the area.
def text_in_area(self, area: Rectangle) -> str:
    """
    Extracts text from a rectangular part of the page.

    :param area: area to search for text in.
    :return: Only the text found in the area.
    """
    left, bottom = area.left, area.bottom
    right, top = area.right, area.top
    return self._text.get_text_bounded(left, bottom, right, top)
Parameters
- area: area to search for text in.
Returns
Only the text found in the area.
@property
def structures(self) -> Iterator[Structure]:
    """The PDF/UA tags: yields the top-level children of the page's structure tree."""
    tree = self._structtree
    for position in range(pp.raw.FPDF_StructTree_CountChildren(tree)):
        yield Structure(self, pp.raw.FPDF_StructTree_GetChildAtIndex(tree, position))
The PDF/UA tags.
def find(self, string: str, case_sensitive: bool = True) -> Iterator[list[Character]]:
    """
    Searches for a match string as whole, consecutive words and yields the
    characters.

    :param string: The search string.
    :param case_sensitive: Ignore case if false.
    :return: yields one list of characters per match.
    """
    searcher = self._text.search(string, match_case=case_sensitive, match_whole_word=True, consecutive=True)
    # each hit is used as (index of first character, character count)
    while idx := searcher.get_next():
        yield [self.char(ii) for ii in range(idx[0], idx[0] + idx[1])]
Searches for a match string as whole, consecutive words and yields the characters.
Parameters
- string: The search string.
- case_sensitive: Ignore case if false.
Returns
yields the characters found.
@cached_property
def paths(self) -> list[Path]:
    """All path objects of the page, wrapped as paths."""
    objects = self.get_objects([pp.raw.FPDF_PAGEOBJ_PATH])
    return [Path(obj) for obj in objects]
All paths.
@cached_property
def images(self) -> list[Image]:
    """All image objects of the page, wrapped as images."""
    objects = self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])
    return [Image(obj) for obj in objects]
All images.
def graphic_clusters(
    self, predicate: Callable[[Path | Image], bool] = None, absolute_tolerance: float = None
) -> list[tuple[Rectangle, list[Path | Image]]]:
    """
    Groups the paths and images of the page into clusters.

    The objects are first merged into vertical regions by their bottom/top
    extent, then every vertical region is split into horizontal sub-regions
    by the left/right extent of the objects it contains.

    :param predicate: Only objects for which this returns true are clustered.
                      If `None`, all paths and images are used.
    :param absolute_tolerance: Tolerance passed to the region overlap and
                               containment checks. If `None`, 1% of the
                               smaller page dimension is used.
    :return: A list of (bounding box, objects) tuples sorted by descending
             `bbox.y` and then ascending `bbox.x`.
    """
    if absolute_tolerance is None:
        absolute_tolerance = min(self.width, self.height) * 0.01

    # First collect all vertical regions
    filtered_paths = []
    for path in self.paths:
        if predicate is None or predicate(path):
            filtered_paths.append(path)
    for image in self.images:
        if predicate is None or predicate(image):
            filtered_paths.append(image)

    regions = []
    for path in sorted(filtered_paths, key=lambda path: path.bbox.y):
        for reg in regions:
            if reg.overlaps(path.bbox.bottom, path.bbox.top, absolute_tolerance):
                # They overlap, so merge them
                reg.v0 = min(reg.v0, path.bbox.bottom)
                reg.v1 = max(reg.v1, path.bbox.top)
                reg.objs.append(path)
                break
        else:
            regions.append(Region(path.bbox.bottom, path.bbox.top, path))

    # Now collect horizontal region inside each vertical region.
    # The x-ordering does not depend on the region, so sort only once.
    paths_by_x = sorted(filtered_paths, key=lambda path: path.bbox.x)
    for yreg in regions:
        for path in paths_by_x:
            # check if horizontal line is contained in vregion
            if yreg.contains(path.bbox.y, absolute_tolerance):
                for xreg in yreg.subregions:
                    if xreg.overlaps(path.bbox.left, path.bbox.right, absolute_tolerance):
                        # They overlap so merge them
                        xreg.v0 = min(xreg.v0, path.bbox.left)
                        xreg.v1 = max(xreg.v1, path.bbox.right)
                        xreg.objs.append(path)
                        break
                else:
                    yreg.subregions.append(Region(path.bbox.left, path.bbox.right, path))

    clusters = []
    for yreg in regions:
        for xreg in yreg.subregions:
            if len(yreg.subregions) > 1:
                # Strip down the height again for subregions. Use the real
                # extremes of the objects: fixed start values (1e9 and 0) clamp
                # the result for negative or very large coordinates.
                y0 = min((obj.bbox.bottom for obj in xreg.objs), default=yreg.v0)
                y1 = max((obj.bbox.top for obj in xreg.objs), default=yreg.v1)
            else:
                y0, y1 = yreg.v0, yreg.v1
            bbox = Rectangle(xreg.v0, y0, xreg.v1, y1)
            clusters.append((bbox, xreg.objs))

    return sorted(clusters, key=lambda c: (-c[0].y, c[0].x))
Inherited Members
- pypdfium2._helpers.page.PdfPage
- parent
- get_width
- get_height
- get_size
- get_rotation
- set_rotation
- get_mediabox
- set_mediabox
- get_cropbox
- set_cropbox
- get_bleedbox
- set_bleedbox
- get_trimbox
- set_trimbox
- get_artbox
- set_artbox
- get_bbox
- get_textpage
- insert_obj
- remove_obj
- gen_content
- get_objects
- render
- pypdfium2.internal.bases.AutoCloseable
- close
class Character:
    """
    Each character on the PDF page is represented by a character object,
    describing exactly where and how to render the associated glyph.

    While there are font flags, PDF files typically use entirely different fonts
    to render normal, bold, and italic characters.

    The character's loose bounding box may not always be available, since it
    must be explicitly provided by the font. The tight bounding box is only
    available as long as the glyph is renderable, so a space character may have
    a loose, but not a tight bounding box, or none at all.
    """

    class RenderMode(Enum):
        """Tells the PDF viewer how to render this character glyph."""

        UNKNOWN = -1
        FILL = 0
        STROKE = 1
        FILL_STROKE = 2
        INVISIBLE = 3
        FILL_CLIP = 4
        STROKE_CLIP = 5
        FILL_STROKE_CLIP = 6
        CLIP = 7

    def __init__(self, page: "modm_data.pdf.page.Page", index: int):  # noqa: F821
        """
        :param page: The page containing the character.
        :param index: The index of the character.
        """
        self._page = page
        self._text = page._text
        self._index = index
        self._font = None
        # raw character angle converted from radians to whole degrees
        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))

        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
        """The unicode value of the character."""
        self.objlink: "modm_data.pdf.link.ObjLink" = None  # noqa: F821
        """The object link of this character or `None`"""
        self.weblink: "modm_data.pdf.link.WebLink" = None  # noqa: F821
        """The web link of this character or `None`"""

        self._bbox = self._page_rect(Rectangle(*self._text.get_charbox(self._index, loose=True)))

    def _page_rect(self, rect: Rectangle) -> Rectangle:
        """Maps a raw character box into page coordinates; only rotated pages need the swap."""
        if not self._page.rotation:
            return rect
        height = self._page.height
        return Rectangle(rect.p0.y, height - rect.p1.x, rect.p1.y, height - rect.p0.x)

    def _font_flags(self) -> tuple[str, int]:
        """Returns the cached (font name, font flags) pair, querying it on first use."""
        if self._font is None:
            size = 255
            font = ctypes.create_string_buffer(size)
            flags = ctypes.c_int()
            needed = pp.raw.FPDFText_GetFontInfo(self._text, self._index, font, size, flags)
            if needed > size:
                # The name did not fit, so the buffer was left untouched: query
                # again with the reported size instead of returning "" and 0.
                font = ctypes.create_string_buffer(needed)
                pp.raw.FPDFText_GetFontInfo(self._text, self._index, font, needed, flags)
            self._font = (font.value.decode("utf-8"), flags.value)
        return self._font

    @property
    def char(self) -> str:
        """The printable string of the unicode value, or an empty string."""
        char = chr(self.unicode)
        return char if char.isprintable() else ""

    @cached_property
    def origin(self) -> Point:
        """The origin of the character."""
        x, y = ctypes.c_double(), ctypes.c_double()
        assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y)
        if self._page.rotation:
            return Point(y.value, self._page.height - x.value)
        return Point(x.value, y.value)

    @cached_property
    def width(self) -> float:
        """The width of the character's bounding box."""
        # for rotated characters the box dimensions are swapped
        if self.rotation:
            return self.bbox.height
        return self.bbox.width

    @cached_property
    def height(self) -> float:
        """The height of the character's bounding box."""
        # for rotated characters the box dimensions are swapped
        if self.rotation:
            return self.bbox.width
        return self.bbox.height

    @cached_property
    def tbbox(self) -> Rectangle:
        """The tight bounding box of the character."""
        return self._page_rect(Rectangle(*self._text.get_charbox(self._index)))

    @property
    def bbox(self) -> Rectangle:
        """
        The loose bounding box of the character.
        .. note::
            If the loose bounding box is not available, the tight bounding box
            is used instead.
        """
        if not self._bbox.width or not self._bbox.height:
            return self.tbbox
        return self._bbox

    @cached_property
    def twidth(self) -> float:
        """The width of the character's tight bounding box."""
        return self.tbbox.width

    @cached_property
    def theight(self) -> float:
        """The height of the character's tight bounding box."""
        return self.tbbox.height

    @cached_property
    def render_mode(self) -> RenderMode:
        """The render mode of the character."""
        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))

    @cached_property
    def rotation(self) -> int:
        """
        The rotation of the character in degrees.

        The page rotation is added (modulo 360) when both the page and the
        character are rotated.
        """
        # Special case for vertical text in rotated pages
        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xA, 0xD}:
            return 90
        if self._page.rotation and self._rotation:
            return (self._page.rotation + self._rotation) % 360
        return self._rotation

    @cached_property
    def size(self) -> float:
        """The font size of the character."""
        return pp.raw.FPDFText_GetFontSize(self._text, self._index)

    @cached_property
    def weight(self) -> int:
        """The font weight of the character."""
        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)

    @cached_property
    def fill(self) -> int:
        """The fill color of the character packed as 0xRRGGBBAA."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def stroke(self) -> int:
        """The stroke color of the character packed as 0xRRGGBBAA."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def font(self) -> str:
        """The font name of the character."""
        return self._font_flags()[0]

    @cached_property
    def flags(self) -> int:
        """The font flags of the character."""
        return self._font_flags()[1]

    def descr(self) -> str:
        """Human-readable description of the character for debugging."""
        char = chr(self.unicode)
        if not char.isprintable():
            char = hex(self.unicode)
        return (
            f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, "
            f"{self.render_mode}, {self.font}, {hex(self.flags)}, "
            f"{self.fill}, {self.stroke}, {repr(self.bbox)})"
        )

    def __str__(self) -> str:
        return self.char

    def __repr__(self) -> str:
        char = chr(self.unicode)
        # whitespace gets a visible escape, other unprintables their hex value
        escape = {0xA: "\\n", 0xD: "\\r", 0x9: "\\t", 0x20: "␣"}
        char = escape.get(self.unicode, char if char.isprintable() else hex(self.unicode))
        return char
Each character on the PDF page is represented by a character object, describing exactly where and how to render the associated glyph.
While there are font flags, PDF files typically use entirely different fonts to render normal, bold, and italic characters.
The character's loose bounding box may not always be available, since it must be explicitly provided by the font. The tight bounding box is only available as long as the glyph is renderable, so a space character may have a loose, but not a tight bounding box, or none at all.
def __init__(self, page: "modm_data.pdf.page.Page", index: int):  # noqa: F821
    """
    :param page: The page containing the character.
    :param index: The index of the character.
    """
    self._page = page
    self._text = page._text
    self._index = index
    # Lazily filled (name, flags) tuple.
    self._font = None
    angle = pp.raw.FPDFText_GetCharAngle(self._text, self._index)
    self._rotation = int(math.degrees(angle))

    self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
    """The unicode value of the character."""
    self.objlink: "modm_data.pdf.link.ObjLink" = None  # noqa: F821
    """The object link of this character or `None`"""
    self.weblink: "modm_data.pdf.link.WebLink" = None  # noqa: F821
    """The web link of this character or `None`"""

    loose = Rectangle(*self._text.get_charbox(self._index, loose=True))
    if self._page.rotation:
        # Map the box into the rotated page coordinate system.
        loose = Rectangle(loose.p0.y, self._page.height - loose.p1.x, loose.p1.y, self._page.height - loose.p0.x)
    self._bbox = loose
Parameters
- page: The page containing the character.
- index: The index of the character.
@property
def char(self) -> str:
    """The printable string of the unicode value, or an empty string."""
    text = chr(self.unicode)
    if text.isprintable():
        return text
    return ""
The printable string of the unicode value.
@cached_property
def origin(self) -> Point:
    """The origin of the character."""
    cx, cy = ctypes.c_double(), ctypes.c_double()
    assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, cx, cy)
    if not self._page.rotation:
        return Point(cx.value, cy.value)
    # Map the origin into the rotated page coordinate system.
    return Point(cy.value, self._page.height - cx.value)
The origin of the character.
@cached_property
def width(self) -> float:
    """The width of the character's bounding box."""
    # A rotated character has its box dimensions swapped.
    return self.bbox.height if self.rotation else self.bbox.width
The width of the character's bounding box.
@cached_property
def height(self) -> float:
    """The height of the character's bounding box."""
    # A rotated character has its box dimensions swapped.
    return self.bbox.width if self.rotation else self.bbox.height
The height of the character's bounding box.
@cached_property
def tbbox(self) -> Rectangle:
    """The tight bounding box of the character."""
    tight = Rectangle(*self._text.get_charbox(self._index))
    if not self._page.rotation:
        return tight
    # Map the box into the rotated page coordinate system.
    return Rectangle(tight.p0.y, self._page.height - tight.p1.x, tight.p1.y, self._page.height - tight.p0.x)
The tight bounding box of the character.
@property
def bbox(self) -> Rectangle:
    """
    The loose bounding box of the character.
    .. note::
        If the loose bounding box is not available, the tight bounding box
        is used instead.
    """
    # A box with zero width or height counts as "not available".
    if self._bbox.width and self._bbox.height:
        return self._bbox
    return self.tbbox
The loose bounding box of the character.
If the loose bounding box is not available, the tight bounding box is used instead.
@cached_property
def twidth(self) -> float:
    """The width of the character's tight bounding box."""
    tight = self.tbbox
    return tight.width
The width of the character's tight bounding box.
@cached_property
def theight(self) -> float:
    """The height of the character's tight bounding box."""
    tight = self.tbbox
    return tight.height
The height of the character's tight bounding box.
@cached_property
def render_mode(self) -> RenderMode:
    """The render mode of the character."""
    mode = pp.raw.FPDFText_GetTextRenderMode(self._text, self._index)
    return Character.RenderMode(mode)
The render mode of the character.
@cached_property
def rotation(self) -> int:
    """The rotation of the character in degrees modulo 360."""
    page_rotation = self._page.rotation
    own_rotation = self._rotation
    # Special case for vertical text in rotated pages
    if page_rotation == 90 and own_rotation == 0 and self.unicode not in {0x20, 0xA, 0xD}:
        return 90
    if page_rotation and own_rotation:
        return (page_rotation + own_rotation) % 360
    return own_rotation
The rotation of the character in degrees modulo 360.
@cached_property
def size(self) -> float:
    """The font size of the character."""
    font_size = pp.raw.FPDFText_GetFontSize(self._text, self._index)
    return font_size
The font size of the character.
@cached_property
def weight(self) -> int:
    """The font weight of the character."""
    font_weight = pp.raw.FPDFText_GetFontWeight(self._text, self._index)
    return font_weight
The font weight of the character.
@cached_property
def fill(self) -> int:
    """The fill color of the character, packed as `r << 24 | g << 16 | b << 8 | a`."""
    red, green, blue, alpha = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
    pp.raw.FPDFText_GetFillColor(self._text, self._index, red, green, blue, alpha)
    return red.value << 24 | green.value << 16 | blue.value << 8 | alpha.value
The fill color of the character.
@cached_property
def stroke(self) -> int:
    """The stroke color of the character, packed as `r << 24 | g << 16 | b << 8 | a`."""
    red, green, blue, alpha = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
    pp.raw.FPDFText_GetStrokeColor(self._text, self._index, red, green, blue, alpha)
    return red.value << 24 | green.value << 16 | blue.value << 8 | alpha.value
The stroke color of the character.
@cached_property
def font(self) -> str:
    """The font name of the character."""
    name, _ = self._font_flags()
    return name
The font name of the character.
@cached_property
def flags(self) -> int:
    """The font flags of the character."""
    _, bits = self._font_flags()
    return bits
The font flags of the character.
def descr(self) -> str:
    """Human-readable description of the character for debugging."""
    label = chr(self.unicode)
    if not label.isprintable():
        # Show unprintable characters by their code point instead.
        label = hex(self.unicode)
    return (
        f"Chr({label}, {self.size}, {self.weight}, {self.rotation}, "
        f"{self.render_mode}, {self.font}, {hex(self.flags)}, "
        f"{self.fill}, {self.stroke}, {repr(self.bbox)})"
    )
Human-readable description of the character for debugging.
class RenderMode(Enum):
    """Tells the PDF viewer how to render this character glyph."""

    # Fallback for when no render mode is known.
    UNKNOWN = -1

    # Plain painting modes.
    FILL = 0
    STROKE = 1
    FILL_STROKE = 2
    INVISIBLE = 3

    # Modes that involve clipping.
    FILL_CLIP = 4
    STROKE_CLIP = 5
    FILL_STROKE_CLIP = 6
    CLIP = 7
Tells the PDF viewer how to render this character glyph.
Inherited Members
- enum.Enum
- name
- value
class Path(pp.PdfObject):
    """
    PDF uses a subset of the PostScript graphics language, which draws vector
    paths with various rendering options. We are only interested in the basic
    properties, in particular, for recognizing table cell borders.

    This class specializes `pypdfium2.PdfObject` to add accessors for graphics
    containing vector paths of various configurations.

    You must construct the paths by calling `modm_data.pdf.page.Page.paths`.
    """

    class Type(Enum):
        """Path Type"""

        LINE = 0
        BEZIER = 1
        MOVE = 2

    class Cap(Enum):
        """Path Cap Type"""

        BUTT = 0
        ROUND = 1
        PROJECTING_SQUARE = 2

    class Join(Enum):
        """Path Join Type"""

        MITER = 0
        ROUND = 1
        BEVEL = 2

    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: PDF object of the path.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_PATH
        self.type = pp.raw.FPDF_PAGEOBJ_PATH

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        return self.get_matrix()

    @cached_property
    def count(self) -> int:
        """Number of segments in this path."""
        return pp.raw.FPDFPath_CountSegments(self)

    @cached_property
    def fill(self) -> int:
        """The fill color encoded as 32-bit RGBA."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        assert pp.raw.FPDFPageObj_GetFillColor(self, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def stroke(self) -> int:
        """The stroke color encoded as 32-bit RGBA."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        assert pp.raw.FPDFPageObj_GetStrokeColor(self, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def width(self) -> float:
        """The stroke width."""
        width = ctypes.c_float()
        assert pp.raw.FPDFPageObj_GetStrokeWidth(self, width)
        return width.value

    @cached_property
    def cap(self) -> Cap:
        """Line cap type."""
        return Path.Cap(pp.raw.FPDFPageObj_GetLineCap(self))

    @cached_property
    def join(self) -> Join:
        """Line join type."""
        return Path.Join(pp.raw.FPDFPageObj_GetLineJoin(self))

    @cached_property
    def bbox(self) -> Rectangle:
        """
        Bounding box of the path.
        .. warning::
            The bounding box is only approximated using the control points!
            Therefore bezier curves will likely have a larger bounding box.
        """
        left, bottom = ctypes.c_float(), ctypes.c_float()
        right, top = ctypes.c_float(), ctypes.c_float()
        assert pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top)
        bbox = Rectangle(left.value, bottom.value, right.value, top.value)
        if self.page.rotation:
            # Map the box into the rotated page coordinate system.
            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
        return bbox

    @cached_property
    def points(self) -> list[Point]:
        """
        List of points of the path. If the path is closed, the first point is
        added to the end of the list.

        Each point is transformed by the path matrix and, on rotated pages,
        mapped into the rotated page coordinate system.
        """
        points = []
        for ii in range(self.count):
            seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
            ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
            # The first point should always be MOVETO
            assert ii or ptype == Path.Type.MOVE

            x, y = ctypes.c_float(), ctypes.c_float()
            assert pp.raw.FPDFPathSegment_GetPoint(seg, x, y)
            x, y = self.matrix.on_point(x.value, y.value)
            points.append(Point(x, y, type=ptype))

            if pp.raw.FPDFPathSegment_GetClose(seg):
                points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))

        if self.page.rotation:
            # Rotate every point using its OWN coordinates. Reading the loop
            # variables `x`/`y` here would collapse all points onto the last
            # segment (and fail for an empty path).
            height = self.page.height
            points = [Point(p.y, height - p.x, type=p.type) for p in points]
        return points

    @cached_property
    def lines(self) -> list[Line]:
        """List of lines between the path points."""
        points = self.points
        # Each line carries the type of the point it ends on.
        return [
            Line(start, end, width=self.width, type=end.type)
            for start, end in zip(points, points[1:])
        ]

    def __repr__(self) -> str:
        points = ",".join(repr(p) for p in self.points)
        return f"P{self.count}={points}"
PDF uses a subset of the PostScript graphics language, which draws vector paths with various rendering options. We are only interested in the basic properties, in particular, for recognizing table cell borders.
This class specializes `pypdfium2.PdfObject` to add accessors for graphics containing vector paths of various configurations.
You must construct the paths by calling `modm_data.pdf.page.Page.paths`.
def __init__(self, obj):
    """
    :param obj: PDF object of the path.
    """
    path_type = pp.raw.FPDF_PAGEOBJ_PATH
    super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
    # Only page objects that really are paths may be wrapped.
    assert pp.raw.FPDFPageObj_GetType(obj.raw) == path_type
    self.type = path_type
Parameters
- obj: PDF object of the path.
@cached_property
def matrix(self) -> pp.PdfMatrix:
    """The transformation matrix."""
    transform = self.get_matrix()
    return transform
The transformation matrix.
@cached_property
def count(self) -> int:
    """Number of segments in this path."""
    segments = pp.raw.FPDFPath_CountSegments(self)
    return segments
Number of segments in this path.
@cached_property
def fill(self) -> int:
    """The fill color encoded as 32-bit RGBA."""
    red, green, blue, alpha = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
    assert pp.raw.FPDFPageObj_GetFillColor(self, red, green, blue, alpha)
    return red.value << 24 | green.value << 16 | blue.value << 8 | alpha.value
The fill color encoded as 32-bit RGBA.
@cached_property
def stroke(self) -> int:
    """The stroke color encoded as 32-bit RGBA."""
    red, green, blue, alpha = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
    assert pp.raw.FPDFPageObj_GetStrokeColor(self, red, green, blue, alpha)
    return red.value << 24 | green.value << 16 | blue.value << 8 | alpha.value
The stroke color encoded as 32-bit RGBA.
@cached_property
def width(self) -> float:
    """The stroke width."""
    stroke_width = ctypes.c_float()
    assert pp.raw.FPDFPageObj_GetStrokeWidth(self, stroke_width)
    return stroke_width.value
The stroke width.
@cached_property
def cap(self) -> Cap:
    """Line cap type."""
    raw_cap = pp.raw.FPDFPageObj_GetLineCap(self)
    return Path.Cap(raw_cap)
Line cap type.
@cached_property
def join(self) -> Join:
    """Line join type."""
    raw_join = pp.raw.FPDFPageObj_GetLineJoin(self)
    return Path.Join(raw_join)
Line join type.
@cached_property
def bbox(self) -> Rectangle:
    """
    Bounding box of the path.
    .. warning::
        The bounding is only approximated using the control points!
        Therefore bezier curves will likely have a larger bounding box.
    """
    left, bottom = ctypes.c_float(), ctypes.c_float()
    right, top = ctypes.c_float(), ctypes.c_float()
    assert pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top)
    bounds = Rectangle(left.value, bottom.value, right.value, top.value)
    if not self.page.rotation:
        return bounds
    # Map the box into the rotated page coordinate system.
    return Rectangle(bounds.p0.y, self.page.height - bounds.p1.x, bounds.p1.y, self.page.height - bounds.p0.x)
Bounding box of the path.
The bounding box is only approximated using the control points! Therefore, the bounding box reported for Bézier curves will likely be larger than the curve itself.
@cached_property
def points(self) -> list[Point]:
    """
    List of points of the path. If the path is closed, the first point is
    added to the end of the list.

    Each point is transformed by the path matrix and, on rotated pages,
    mapped into the rotated page coordinate system.
    """
    points = []
    for ii in range(self.count):
        seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
        ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
        # The first point should always be MOVETO
        assert ii or ptype == Path.Type.MOVE

        x, y = ctypes.c_float(), ctypes.c_float()
        assert pp.raw.FPDFPathSegment_GetPoint(seg, x, y)
        x, y = self.matrix.on_point(x.value, y.value)
        points.append(Point(x, y, type=ptype))

        if pp.raw.FPDFPathSegment_GetClose(seg):
            points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))

    if self.page.rotation:
        # Rotate every point using its OWN coordinates. Reading the loop
        # variables `x`/`y` here would collapse all points onto the last
        # segment (and fail for an empty path).
        height = self.page.height
        points = [Point(p.y, height - p.x, type=p.type) for p in points]
    return points
List of points of the path. If the path is closed, the first point is added to the end of the list.
@cached_property
def lines(self) -> list[Line]:
    """List of lines between the path points."""
    pts = self.points
    # Pair each point with its successor; a line takes the type of its end point.
    return [Line(start, end, width=self.width, type=end.type) for start, end in zip(pts, pts[1:])]
List of lines between the path points.
Inherited Members
- pypdfium2._helpers.pageobjects.PdfObject
- parent
- get_pos
- get_matrix
- set_matrix
- transform
- pypdfium2.internal.bases.AutoCloseable
- close
Path Type
Inherited Members
- enum.Enum
- name
- value
Path Cap Type
Inherited Members
- enum.Enum
- name
- value
Path Join Type
Inherited Members
- enum.Enum
- name
- value
class Image(pp.PdfImage):
    """
    This class extends `pypdfium2.PdfImage` to align it with the interface of
    the `Path` class so that it can be used in the same
    algorithms without filtering.

    You must construct the images by calling `modm_data.pdf.page.Page.images`.

    .. note:: Images are currently ignored.
    """

    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: Page object of the image.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE
        self.type = pp.raw.FPDF_PAGEOBJ_IMAGE

        self.count: int = 4
        """Number of segments. Always 4 due to rectangular image form.
        (For compatibility with `Path.count`.)"""
        self.stroke: int = 0
        """The border stroke color. Always 0.
        (For compatibility with `Path.stroke`.)"""
        self.fill: int = 0
        """The image fill color. Always 0.
        (For compatibility with `Path.fill`.)"""
        self.width: float = 0
        """The border line width. Always 0.
        (For compatibility with `Path.width`.)"""

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        return self.get_matrix()

    @cached_property
    def bbox(self) -> Rectangle:
        """The bounding box of the image."""
        bbox = Rectangle(*self.get_pos())
        if self.page.rotation:
            # Map the box into the rotated page coordinate system.
            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
        return bbox

    @cached_property
    def points(self) -> list[Point]:
        """
        The 4 points of the bounding box.
        (For compatibility with `Path.points`.)
        """
        points = self.bbox.points
        if self.page.rotation:
            # NOTE(review): `bbox` already applies the page rotation, so this
            # looks like a second rotation of the same points - confirm intent.
            # The point type is passed by keyword, as `Path.points` does.
            points = [Point(p.y, self.page.height - p.x, type=p.type) for p in points]
        return points

    @cached_property
    def lines(self) -> list[Line]:
        """
        The 4 lines of the bounding box.
        (For compatibility with `Path.lines`.)
        """
        p = self.points
        # Width and type are passed by keyword, as `Path.lines` does.
        return [
            Line(p[0], p[1], width=0, type=p[1].type),
            Line(p[1], p[2], width=0, type=p[2].type),
            Line(p[2], p[3], width=0, type=p[3].type),
            Line(p[3], p[0], width=0, type=p[0].type),
        ]

    def __repr__(self) -> str:
        return f"I{self.bbox}"
This class extends `pypdfium2.PdfImage` to align it with the interface of the `Path` class so that it can be used in the same algorithms without filtering.
You must construct the images by calling `modm_data.pdf.page.Page.images`.
Images are currently ignored.
def __init__(self, obj):
    """
    :param obj: Page object of the image.
    """
    image_type = pp.raw.FPDF_PAGEOBJ_IMAGE
    super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
    # Only page objects that really are images may be wrapped.
    assert pp.raw.FPDFPageObj_GetType(obj.raw) == image_type
    self.type = image_type

    self.count: int = 4
    """Number of segments. Always 4 due to rectangular image form.
    (For compatibility with `Path.count`.)"""
    self.stroke: int = 0
    """The border stroke color. Always 0.
    (For compatibility with `Path.stroke`.)"""
    self.fill: int = 0
    """The image fill color. Always 0.
    (For compatibility with `Path.fill`.)"""
    self.width: float = 0
    """The border line width. Always 0.
    (For compatibility with `Path.width`.)"""
Parameters
- obj: Page object of the image.
Number of segments. Always 4 due to rectangular image form.
(For compatibility with `Path.count`.)
@cached_property
def matrix(self) -> pp.PdfMatrix:
    """The transformation matrix."""
    transform = self.get_matrix()
    return transform
The transformation matrix.
@cached_property
def bbox(self) -> Rectangle:
    """The bounding box of the image."""
    bounds = Rectangle(*self.get_pos())
    if not self.page.rotation:
        return bounds
    # Map the box into the rotated page coordinate system.
    return Rectangle(bounds.p0.y, self.page.height - bounds.p1.x, bounds.p1.y, self.page.height - bounds.p0.x)
The bounding box of the image.
@cached_property
def points(self) -> list[Point]:
    """
    The 4 points of the bounding box.
    (For compatibility with `Path.points`.)
    """
    points = self.bbox.points
    if self.page.rotation:
        # NOTE(review): `bbox` already applies the page rotation, so this
        # looks like a second rotation of the same points - confirm intent.
        # The point type is passed by keyword, as `Path.points` does.
        points = [Point(p.y, self.page.height - p.x, type=p.type) for p in points]
    return points
The 4 points of the bounding box.
(For compatibility with `Path.points`.)
@cached_property
def lines(self) -> list[Line]:
    """
    The 4 lines of the bounding box.
    (For compatibility with `Path.lines`.)
    """
    p = self.points
    # Width and type are passed by keyword, as `Path.lines` does.
    # The last line closes the rectangle back to the first point.
    return [
        Line(p[0], p[1], width=0, type=p[1].type),
        Line(p[1], p[2], width=0, type=p[2].type),
        Line(p[2], p[3], width=0, type=p[3].type),
        Line(p[3], p[0], width=0, type=p[0].type),
    ]
The 4 lines of the bounding box.
(For compatibility with `Path.lines`.)
Inherited Members
- pypdfium2._helpers.pageobjects.PdfImage
- SIMPLE_FILTERS
- new
- get_metadata
- get_size
- load_jpeg
- set_bitmap
- get_bitmap
- get_data
- get_filters
- extract
- pypdfium2._helpers.pageobjects.PdfObject
- parent
- get_pos
- get_matrix
- set_matrix
- transform
- pypdfium2.internal.bases.AutoCloseable
- close
class ObjLink:
    """
    A link from one place in the document to another: it stores the bounding
    box of the link source and resolves the destination page. These links can
    be extracted by calling the `modm_data.pdf.page.Page.objlinks` property.
    """

    def __init__(self, page: "modm_data.pdf.Page", link: pp.raw.FPDF_LINK):  # noqa: F821
        """
        :param page: Page containing the link, used to compute bounding box.
        :param link: Raw link object.
        """
        self._page = page
        self._dest = pp.raw.FPDFLink_GetDest(page.pdf, link)

        rect = pp.raw.FS_RECTF()
        assert pp.raw.FPDFLink_GetAnnotRect(link, rect)
        area = Rectangle(rect)
        if page.rotation:
            # Map the box into the rotated page coordinate system.
            area = Rectangle(area.p0.y, page.height - area.p1.x, area.p1.y, page.height - area.p0.x)
        self.bbox: Rectangle = area
        """Bounding box of the link source"""

    @cached_property
    def page_index(self) -> int:
        """0-indexed page number of the link destination."""
        document = self._page.pdf
        return pp.raw.FPDFDest_GetDestPageIndex(document, self._dest)

    def __repr__(self) -> str:
        return f"Obj({self.page_index})"
An internal reference to other objects by an identifier giving the bounding box and destination page. These links can be extracted by calling the `modm_data.pdf.page.Page.objlinks` property.
def __init__(self, page: "modm_data.pdf.Page", link: pp.raw.FPDF_LINK):  # noqa: F821
    """
    :param page: Page containing the link, used to compute bounding box.
    :param link: Raw link object.
    """
    self._page = page
    self._dest = pp.raw.FPDFLink_GetDest(page.pdf, link)

    rect = pp.raw.FS_RECTF()
    assert pp.raw.FPDFLink_GetAnnotRect(link, rect)
    area = Rectangle(rect)
    if page.rotation:
        # Map the box into the rotated page coordinate system.
        area = Rectangle(area.p0.y, page.height - area.p1.x, area.p1.y, page.height - area.p0.x)
    self.bbox: Rectangle = area
    """Bounding box of the link source"""
Parameters
- page: Page containing the link, used to compute bounding box.
- link: Raw link object.
class WebLink:
    """
    An external reference to URLs giving the bounding box and destination URL.
    These links can be extracted by calling the
    `modm_data.pdf.page.Page.weblinks` property.
    """

    def __init__(self, page: "modm_data.pdf.Page", index: int):  # noqa: F821
        """
        :param page: Page containing the link, used to compute bounding box.
        :param index: 0-index of the weblink object.
        """
        self._page = page
        self._link = page._linkpage
        self._index = index

    @cached_property
    def bbox_count(self) -> int:
        """The number of bounding boxes associated with this weblink."""
        return pp.raw.FPDFLink_CountRects(self._link, self._index)

    @cached_property
    def bboxes(self) -> list[Rectangle]:
        """The bounding boxes associated with this weblink."""
        bboxes = []
        for ii in range(self.bbox_count):
            x0, y0 = ctypes.c_double(), ctypes.c_double()
            x1, y1 = ctypes.c_double(), ctypes.c_double()
            # pdfium's out-parameters are ordered left, top, right, bottom.
            assert pp.raw.FPDFLink_GetRect(self._link, self._index, ii, x0, y1, x1, y0)
            bboxes.append(Rectangle(x0.value, y0.value, x1.value, y1.value))
        if self._page.rotation:
            # Map the boxes into the rotated page coordinate system.
            bboxes = [
                Rectangle(bbox.p0.y, self._page.height - bbox.p1.x, bbox.p1.y, self._page.height - bbox.p0.x)
                for bbox in bboxes
            ]
        return bboxes

    @cached_property
    def range(self) -> tuple[int, int]:
        """Start and end index of the characters associated with this link."""
        cstart = ctypes.c_int()
        ccount = ctypes.c_int()
        assert pp.raw.FPDFLink_GetTextRange(self._link, self._index, cstart, ccount)
        return (cstart.value, cstart.value + ccount.value)

    @cached_property
    def url(self) -> str:
        """The URL string of this link."""
        # First ask pdfium how many UTF-16 code units (including the
        # terminating NUL) are required, so that URLs of any length fit
        # instead of tripping over a fixed-size buffer.
        length = pp.raw.FPDFLink_GetURL(self._link, self._index, None, 0)
        if length <= 0:
            return ""
        cbuffer = (ctypes.c_ushort * length)()
        retlen = pp.raw.FPDFLink_GetURL(self._link, self._index, cbuffer, length)
        assert retlen <= length
        return bytes(cbuffer).decode("utf-16-le").strip("\x00")

    def __repr__(self) -> str:
        return f"Url({self.url})"
An external reference to URLs giving the bounding box and destination URL. These links can be extracted by calling the `modm_data.pdf.page.Page.weblinks` property.
def __init__(self, page: "modm_data.pdf.Page", index: int):  # noqa: F821
    """
    :param page: Page containing the link, used to compute bounding box.
    :param index: 0-index of the weblink object.
    """
    # The weblink is addressed by index within the page's link page handle.
    self._index = index
    self._link = page._linkpage
    self._page = page
Parameters
- page: Page containing the link, used to compute bounding box.
- index: 0-index of the weblink object.
@cached_property
def bbox_count(self) -> int:
    """The number of bounding boxes associated with this weblink."""
    rect_count = pp.raw.FPDFLink_CountRects(self._link, self._index)
    return rect_count
The number of bounding boxes associated with this weblink.
@cached_property
def bboxes(self) -> list[Rectangle]:
    """The bounding boxes associated with this weblink."""
    rects = []
    for rect_index in range(self.bbox_count):
        left, bottom = ctypes.c_double(), ctypes.c_double()
        right, top = ctypes.c_double(), ctypes.c_double()
        # pdfium's out-parameters are ordered left, top, right, bottom.
        assert pp.raw.FPDFLink_GetRect(self._link, self._index, rect_index, left, top, right, bottom)
        rects.append(Rectangle(left.value, bottom.value, right.value, top.value))
    if not self._page.rotation:
        return rects
    # Map the boxes into the rotated page coordinate system.
    return [
        Rectangle(rect.p0.y, self._page.height - rect.p1.x, rect.p1.y, self._page.height - rect.p0.x)
        for rect in rects
    ]
The bounding boxes associated with this weblink.
@cached_property
def range(self) -> tuple[int, int]:
    """Start and end index of the characters associated with this link."""
    first = ctypes.c_int()
    length = ctypes.c_int()
    assert pp.raw.FPDFLink_GetTextRange(self._link, self._index, first, length)
    # pdfium reports start and count; the end index is their sum.
    start = first.value
    return (start, start + length.value)
Start and end index of the characters associated with this link.
@cached_property
def url(self) -> str:
    """The URL string of this link."""
    # First ask pdfium how many UTF-16 code units (including the
    # terminating NUL) are required, so that URLs of any length fit
    # instead of tripping over a fixed-size buffer.
    length = pp.raw.FPDFLink_GetURL(self._link, self._index, None, 0)
    if length <= 0:
        return ""
    cbuffer = (ctypes.c_ushort * length)()
    retlen = pp.raw.FPDFLink_GetURL(self._link, self._index, cbuffer, length)
    assert retlen <= length
    return bytes(cbuffer).decode("utf-16-le").strip("\x00")
The URL string of this link.
11class Structure: 12 """ 13 A tagged PDF/UA (Universal Accessibility) contains the structure of content 14 as a tree data structure with similar semantics to HTML. Sadly, the quality 15 of the tags depends heavily on the PDF creation software. See 16 [Overview of PDF tags](https://accessible-pdf.info/en/basics/general/overview-of-the-pdf-tags/). 17 18 An example of an accessible pdf that can be inspected via these classes: 19 [Rock On, D.C. Music Festival](https://commonlook.com/wp-content/uploads/2020/04/accessible-pdf-example.pdf). 20 21 This class is a convenience wrapper around [the pdfium structtree methods]( 22 https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h). 23 """ 24 25 def __init__(self, page: "modm_data.pdf.page.Page", element: pp.raw.FPDF_STRUCTELEMENT, parent: "Structure" = None): # noqa: F821 26 self._page = page 27 self._element = element 28 self.parent: Structure = weakref.ref(parent) if parent else None 29 """The parent node.""" 30 31 def _get_string(self, function) -> str: 32 length = function(self._element, 0, 0) 33 clength = ctypes.c_ulong(length) 34 cbuffer = ctypes.create_string_buffer(length) 35 function(self._element, cbuffer, clength) 36 return bytes(cbuffer).decode("utf-16-le", errors="ignore") 37 38 @cached_property 39 def title(self) -> str: 40 """Title `/T`""" 41 return self._get_string(pp.raw.FPDF_StructElement_GetTitle) 42 43 @cached_property 44 def actual_text(self) -> str: 45 """The actual text.""" 46 return self._get_string(pp.raw.FPDF_StructElement_GetActualText) 47 48 @cached_property 49 def alt_text(self) -> str: 50 """Alternate Text""" 51 return self._get_string(pp.raw.FPDF_StructElement_GetAltText) 52 53 @cached_property 54 def type(self) -> str: 55 """Type `/S`""" 56 return self._get_string(pp.raw.FPDF_StructElement_GetType) 57 58 @cached_property 59 def obj_type(self) -> str: 60 """Object Type `/Type`""" 61 return self._get_string(pp.raw.FPDF_StructElement_GetObjType) 62 63 @cached_property 64 def 
language(self) -> str: 65 """The case-insensitive IETF BCP 47 language code.""" 66 return self._get_string(pp.raw.FPDF_StructElement_GetLang) 67 68 @cached_property 69 def id(self) -> str: 70 """Identifier""" 71 return self._get_string(pp.raw.FPDF_StructElement_GetID) 72 73 @cached_property 74 def marked_ids(self) -> list[int]: 75 """List of marked content identifiers""" 76 ids = [] 77 for index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)): 78 if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1: 79 ids.append(mcid) 80 return ids 81 82 @cached_property 83 def attributes(self) -> dict[str, str | bool | float]: 84 """ 85 All attributes of this structure element as a dictionary. 86 87 .. note:: 88 Due to limitations of the pdfium API, attribute arrays cannot be 89 extracted! The values are marked as `[?]` in the dictionary. 90 """ 91 kv = {} 92 for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)): 93 attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex) 94 for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)): 95 # Get the name 96 clength = ctypes.c_ulong(0) 97 cname = ctypes.create_string_buffer(1) # workaround to get length 98 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength) 99 cname = ctypes.create_string_buffer(clength.value) 100 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength) 101 name = cname.raw.decode("utf-8", errors="ignore") 102 103 # Get the type 104 atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname) 105 assert atype != pp.raw.FPDF_OBJECT_UNKNOWN 106 107 # Then get each type individually 108 match atype: 109 case pp.raw.FPDF_OBJECT_BOOLEAN: 110 cbool = ctypes.bool() 111 assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool) 112 kv[name] = cbool.value 113 114 case pp.raw.FPDF_OBJECT_NUMBER: 115 cfloat = ctypes.c_float() 116 assert 
pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat) 117 kv[name] = cfloat.value 118 119 case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME: 120 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength) 121 cattrname = ctypes.create_string_buffer(clength.value * 2) 122 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength) 123 kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[: clength.value - 1] 124 125 # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed? 126 # case pp.raw.FPDF_OBJECT_ARRAY: 127 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength) 128 # cblob = ctypes.create_string_buffer(clength.value) 129 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength) 130 # kv[name] = cblob.raw 131 132 case pp.raw.FPDF_OBJECT_ARRAY: 133 kv[name] = "[?]" 134 135 case _: 136 kv[name] = f"[unknown={atype}?]" 137 return kv 138 139 @cache 140 def child(self, index: int) -> "Structure": 141 """ 142 :param index: 0-index of child. 143 :return: Child structure. 
144 """ 145 index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index) 146 return Structure(self._page, index, self) 147 148 @property 149 def children(self) -> list: 150 """All child structures.""" 151 count = pp.raw.FPDF_StructElement_CountChildren(self._element) 152 for ii in range(count): 153 yield self.child(ii) 154 155 def descr(self, indent=0) -> str: 156 """Description including all children via indentation.""" 157 string = " " * indent + repr(self) + "\n" 158 for child in self.children: 159 string += child.descr(indent + 4) 160 return string 161 162 def __repr__(self) -> str: 163 values = [] 164 if self.type: 165 values.append(f"type={self.type}") 166 if self.title: 167 values.append(f"title={self.title}") 168 if self.actual_text: 169 values.append(f"act_text={self.actual_text}") 170 if self.alt_text: 171 values.append(f"alt_text={self.alt_text}") 172 if self.id: 173 values.append(f"id={self.id}") 174 values += [f"mid={i}" for i in self.marked_ids] 175 values += [f"{k}={v}" for k, v in self.attributes.items()] 176 return f"S({','.join(map(str, values))})"
A tagged PDF/UA (Universal Accessibility) contains the structure of content as a tree data structure with similar semantics to HTML. Sadly, the quality of the tags depends heavily on the PDF creation software. See Overview of PDF tags.
An example of an accessible PDF that can be inspected via these classes: Rock On, D.C. Music Festival.
This class is a convenience wrapper around the pdfium structtree methods.
@cached_property
def title(self) -> str:
    """Title (`/T`) of this structure element."""
    getter = pp.raw.FPDF_StructElement_GetTitle
    return self._get_string(getter)
Title /T
@cached_property
def actual_text(self) -> str:
    """The actual text of this structure element."""
    getter = pp.raw.FPDF_StructElement_GetActualText
    return self._get_string(getter)
The actual text.
@cached_property
def alt_text(self) -> str:
    """Alternate text of this structure element."""
    getter = pp.raw.FPDF_StructElement_GetAltText
    return self._get_string(getter)
Alternate Text
@cached_property
def type(self) -> str:
    """Type (`/S`) of this structure element."""
    getter = pp.raw.FPDF_StructElement_GetType
    return self._get_string(getter)
Type /S
@cached_property
def obj_type(self) -> str:
    """Object type (`/Type`) of this structure element."""
    getter = pp.raw.FPDF_StructElement_GetObjType
    return self._get_string(getter)
Object Type /Type
@cached_property
def language(self) -> str:
    """The case-insensitive IETF BCP 47 language code of this structure element."""
    getter = pp.raw.FPDF_StructElement_GetLang
    return self._get_string(getter)
The case-insensitive IETF BCP 47 language code.
@cached_property
def id(self) -> str:
    """Identifier of this structure element."""
    getter = pp.raw.FPDF_StructElement_GetID
    return self._get_string(getter)
Identifier
@cached_property
def marked_ids(self) -> list[int]:
    """Marked content identifiers of this structure element, skipping `-1` entries."""
    total = pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)
    candidates = (
        pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, position)
        for position in range(total)
    )
    # Entries for which pdfium returns -1 are left out.
    return [mcid for mcid in candidates if mcid != -1]
List of marked content identifiers
82 @cached_property 83 def attributes(self) -> dict[str, str | bool | float]: 84 """ 85 All attributes of this structure element as a dictionary. 86 87 .. note:: 88 Due to limitations of the pdfium API, attribute arrays cannot be 89 extracted! The values are marked as `[?]` in the dictionary. 90 """ 91 kv = {} 92 for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)): 93 attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex) 94 for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)): 95 # Get the name 96 clength = ctypes.c_ulong(0) 97 cname = ctypes.create_string_buffer(1) # workaround to get length 98 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength) 99 cname = ctypes.create_string_buffer(clength.value) 100 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength) 101 name = cname.raw.decode("utf-8", errors="ignore") 102 103 # Get the type 104 atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname) 105 assert atype != pp.raw.FPDF_OBJECT_UNKNOWN 106 107 # Then get each type individually 108 match atype: 109 case pp.raw.FPDF_OBJECT_BOOLEAN: 110 cbool = ctypes.bool() 111 assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool) 112 kv[name] = cbool.value 113 114 case pp.raw.FPDF_OBJECT_NUMBER: 115 cfloat = ctypes.c_float() 116 assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat) 117 kv[name] = cfloat.value 118 119 case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME: 120 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength) 121 cattrname = ctypes.create_string_buffer(clength.value * 2) 122 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength) 123 kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[: clength.value - 1] 124 125 # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed? 
126 # case pp.raw.FPDF_OBJECT_ARRAY: 127 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength) 128 # cblob = ctypes.create_string_buffer(clength.value) 129 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength) 130 # kv[name] = cblob.raw 131 132 case pp.raw.FPDF_OBJECT_ARRAY: 133 kv[name] = "[?]" 134 135 case _: 136 kv[name] = f"[unknown={atype}?]" 137 return kv
All attributes of this structure element as a dictionary.
Note: Due to limitations of the pdfium API, attribute arrays cannot be extracted! The values are marked as `[?]` in the dictionary.
def child(self, index: int) -> "Structure":
    """
    :param index: 0-index of child.
    :return: Child structure. Repeated calls with the same index return
             the same object.
    """
    # Cache per instance rather than with `functools.cache`: a cache on the
    # method is global, keyed on `self`, and therefore keeps every node (and
    # its children) alive for the lifetime of the process.
    known = self.__dict__.setdefault("_child_cache", {})
    if index not in known:
        element = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
        known[index] = Structure(self._page, element, self)
    return known[index]
Parameters
- index: 0-index of child.
Returns
Child structure.
@property
def children(self) -> list:
    """
    All child structures.

    :return: A list with one entry per child, in index order. Previously this
             was a one-shot generator despite the declared `list` type;
             iteration behaves the same.
    """
    count = pp.raw.FPDF_StructElement_CountChildren(self._element)
    return [self.child(ii) for ii in range(count)]
All child structures.
def descr(self, indent=0) -> str:
    """
    Multi-line description of this node and, recursively, all its children.

    :param indent: Number of spaces placed in front of this node's line;
                   each child level is indented by four more.
    :return: One line per node, each terminated by a newline.
    """
    own_line = f"{' ' * indent}{self!r}\n"
    nested = (node.descr(indent + 4) for node in self.children)
    return own_line + "".join(nested)
Description including all children via indentation.