modm_data.pdf.structure
Tagged PDFs
A tagged PDF/UA (Universal Accessibility) contains the structure of content as a tree data structure with similar semantics to HTML. Sadly, the quality of the tags depends heavily on the PDF creation software. See Overview of PDF tags.
An example of an accessible PDF that can be inspected via these classes: Rock On, D.C. Music Festival.
1# Copyright 2022, Niklas Hauser 2# SPDX-License-Identifier: MPL-2.0 3 4""" 5# Tagged PDFs 6 7A tagged PDF/UA (Universal Accessibility) contains the structure of content as a 8tree data structure with similar semantics to HTML. Sadly, the quality of the 9tags depends heavily on the PDF creation software. See [Overview of PDF tags]( 10https://accessible-pdf.info/en/basics/general/overview-of-the-pdf-tags/). 11 12An example of an accessible pdf that can be inspected via these classes: 13[Rock On, D.C. Music Festival]( 14https://commonlook.com/wp-content/uploads/2020/04/accessible-pdf-example.pdf). 15""" 16 17import ctypes 18from functools import cached_property, cache 19import pypdfium2 as pp 20import weakref 21 22 23class Structure: 24 """ 25 A PDF/UA ("tagged PDF") contains the structure of content as a tree data 26 structure with similar semantics to HTML. 27 28 This class is a convenience wrapper around [the pdfium structtree methods]( 29 https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h). 
30 """ 31 def __init__(self, page: "modm_data.pdf.page.Page", 32 element: pp.raw.FPDF_STRUCTELEMENT, 33 parent: "Structure" = None): 34 self._page = page 35 self._element = element 36 self.parent: Structure = weakref.ref(parent) if parent else None 37 """The parent node.""" 38 39 def _get_string(self, function) -> str: 40 length = function(self._element, 0, 0) 41 clength = ctypes.c_ulong(length) 42 cbuffer = ctypes.create_string_buffer(length) 43 function(self._element, cbuffer, clength) 44 return bytes(cbuffer).decode("utf-16-le", errors="ignore") 45 46 @cached_property 47 def title(self) -> str: 48 """Title `/T`""" 49 return self._get_string(pp.raw.FPDF_StructElement_GetTitle) 50 51 @cached_property 52 def actual_text(self) -> str: 53 """The actual text.""" 54 return self._get_string(pp.raw.FPDF_StructElement_GetActualText) 55 56 @cached_property 57 def alt_text(self) -> str: 58 """Alternate Text""" 59 return self._get_string(pp.raw.FPDF_StructElement_GetAltText) 60 61 @cached_property 62 def type(self) -> str: 63 """Type `/S`""" 64 return self._get_string(pp.raw.FPDF_StructElement_GetType) 65 66 @cached_property 67 def obj_type(self) -> str: 68 """Object Type `/Type`""" 69 return self._get_string(pp.raw.FPDF_StructElement_GetObjType) 70 71 @cached_property 72 def language(self) -> str: 73 """The case-insensitive IETF BCP 47 language code.""" 74 return self._get_string(pp.raw.FPDF_StructElement_GetLang) 75 76 @cached_property 77 def id(self) -> str: 78 """Identifier""" 79 return self._get_string(pp.raw.FPDF_StructElement_GetID) 80 81 @cached_property 82 def marked_ids(self) -> list[int]: 83 """List of marked content identifiers""" 84 ids = [] 85 for index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)): 86 if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1: 87 ids.append(mcid) 88 return ids 89 90 @cached_property 91 def attributes(self) -> dict[str, str|bool|float]: 92 """ 93 All attributes of 
this structure element as a dictionary. 94 95 .. note:: 96 Due to limitations of the pdfium API, attribute arrays cannot be 97 extracted! The values are marked as `[?]` in the dictionary. 98 """ 99 kv = {} 100 for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)): 101 attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex) 102 for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)): 103 # Get the name 104 clength = ctypes.c_ulong(0) 105 cname = ctypes.create_string_buffer(1) # workaround to get length 106 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength) 107 cname = ctypes.create_string_buffer(clength.value) 108 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength) 109 name = cname.raw.decode("utf-8", errors="ignore") 110 111 # Get the type 112 atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname) 113 assert atype != pp.raw.FPDF_OBJECT_UNKNOWN 114 115 # Then get each type individually 116 match atype: 117 case pp.raw.FPDF_OBJECT_BOOLEAN: 118 cbool = ctypes.bool() 119 assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool) 120 kv[name] = cbool.value 121 122 case pp.raw.FPDF_OBJECT_NUMBER: 123 cfloat = ctypes.c_float() 124 assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat) 125 kv[name] = cfloat.value 126 127 case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME: 128 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength) 129 cattrname = ctypes.create_string_buffer(clength.value*2) 130 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength) 131 kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[:clength.value-1] 132 133 # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed? 
134 # case pp.raw.FPDF_OBJECT_ARRAY: 135 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength) 136 # cblob = ctypes.create_string_buffer(clength.value) 137 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength) 138 # kv[name] = cblob.raw 139 140 case pp.raw.FPDF_OBJECT_ARRAY: 141 kv[name] = f"[?]" 142 143 case _: 144 kv[name] = f"[unknown={atype}?]" 145 return kv 146 147 @cache 148 def child(self, index: int) -> "Structure": 149 """ 150 :param index: 0-index of child. 151 :return: Child structure. 152 """ 153 index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index) 154 return Structure(self._page, index, self) 155 156 @property 157 def children(self) -> list: 158 """All child structures.""" 159 count = pp.raw.FPDF_StructElement_CountChildren(self._element) 160 for ii in range(count): 161 yield self.child(ii) 162 163 def descr(self, indent=0) -> str: 164 """Description including all children via indentation.""" 165 string = " " * indent + repr(self) + "\n" 166 for child in self.children: 167 string += child.descr(indent + 4) 168 return string 169 170 def __repr__(self) -> str: 171 values = [] 172 if self.type: values.append(f"type={self.type}") 173 if self.title: values.append(f"title={self.title}") 174 if self.actual_text: values.append(f"act_text={self.actual_text}") 175 if self.alt_text: values.append(f"alt_text={self.alt_text}") 176 if self.id: values.append(f"id={self.id}") 177 values += [f"mid={i}" for i in self.marked_ids] 178 values += [f"{k}={v}" for k, v in self.attributes.items()] 179 return f"S({','.join(map(str, values))})"
24class Structure: 25 """ 26 A PDF/UA ("tagged PDF") contains the structure of content as a tree data 27 structure with similar semantics to HTML. 28 29 This class is a convenience wrapper around [the pdfium structtree methods]( 30 https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h). 31 """ 32 def __init__(self, page: "modm_data.pdf.page.Page", 33 element: pp.raw.FPDF_STRUCTELEMENT, 34 parent: "Structure" = None): 35 self._page = page 36 self._element = element 37 self.parent: Structure = weakref.ref(parent) if parent else None 38 """The parent node.""" 39 40 def _get_string(self, function) -> str: 41 length = function(self._element, 0, 0) 42 clength = ctypes.c_ulong(length) 43 cbuffer = ctypes.create_string_buffer(length) 44 function(self._element, cbuffer, clength) 45 return bytes(cbuffer).decode("utf-16-le", errors="ignore") 46 47 @cached_property 48 def title(self) -> str: 49 """Title `/T`""" 50 return self._get_string(pp.raw.FPDF_StructElement_GetTitle) 51 52 @cached_property 53 def actual_text(self) -> str: 54 """The actual text.""" 55 return self._get_string(pp.raw.FPDF_StructElement_GetActualText) 56 57 @cached_property 58 def alt_text(self) -> str: 59 """Alternate Text""" 60 return self._get_string(pp.raw.FPDF_StructElement_GetAltText) 61 62 @cached_property 63 def type(self) -> str: 64 """Type `/S`""" 65 return self._get_string(pp.raw.FPDF_StructElement_GetType) 66 67 @cached_property 68 def obj_type(self) -> str: 69 """Object Type `/Type`""" 70 return self._get_string(pp.raw.FPDF_StructElement_GetObjType) 71 72 @cached_property 73 def language(self) -> str: 74 """The case-insensitive IETF BCP 47 language code.""" 75 return self._get_string(pp.raw.FPDF_StructElement_GetLang) 76 77 @cached_property 78 def id(self) -> str: 79 """Identifier""" 80 return self._get_string(pp.raw.FPDF_StructElement_GetID) 81 82 @cached_property 83 def marked_ids(self) -> list[int]: 84 """List of marked content identifiers""" 85 ids = [] 86 for index in 
range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)): 87 if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1: 88 ids.append(mcid) 89 return ids 90 91 @cached_property 92 def attributes(self) -> dict[str, str|bool|float]: 93 """ 94 All attributes of this structure element as a dictionary. 95 96 .. note:: 97 Due to limitations of the pdfium API, attribute arrays cannot be 98 extracted! The values are marked as `[?]` in the dictionary. 99 """ 100 kv = {} 101 for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)): 102 attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex) 103 for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)): 104 # Get the name 105 clength = ctypes.c_ulong(0) 106 cname = ctypes.create_string_buffer(1) # workaround to get length 107 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength) 108 cname = ctypes.create_string_buffer(clength.value) 109 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength) 110 name = cname.raw.decode("utf-8", errors="ignore") 111 112 # Get the type 113 atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname) 114 assert atype != pp.raw.FPDF_OBJECT_UNKNOWN 115 116 # Then get each type individually 117 match atype: 118 case pp.raw.FPDF_OBJECT_BOOLEAN: 119 cbool = ctypes.bool() 120 assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool) 121 kv[name] = cbool.value 122 123 case pp.raw.FPDF_OBJECT_NUMBER: 124 cfloat = ctypes.c_float() 125 assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat) 126 kv[name] = cfloat.value 127 128 case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME: 129 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength) 130 cattrname = ctypes.create_string_buffer(clength.value*2) 131 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, 
clength) 132 kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[:clength.value-1] 133 134 # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed? 135 # case pp.raw.FPDF_OBJECT_ARRAY: 136 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength) 137 # cblob = ctypes.create_string_buffer(clength.value) 138 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength) 139 # kv[name] = cblob.raw 140 141 case pp.raw.FPDF_OBJECT_ARRAY: 142 kv[name] = f"[?]" 143 144 case _: 145 kv[name] = f"[unknown={atype}?]" 146 return kv 147 148 @cache 149 def child(self, index: int) -> "Structure": 150 """ 151 :param index: 0-index of child. 152 :return: Child structure. 153 """ 154 index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index) 155 return Structure(self._page, index, self) 156 157 @property 158 def children(self) -> list: 159 """All child structures.""" 160 count = pp.raw.FPDF_StructElement_CountChildren(self._element) 161 for ii in range(count): 162 yield self.child(ii) 163 164 def descr(self, indent=0) -> str: 165 """Description including all children via indentation.""" 166 string = " " * indent + repr(self) + "\n" 167 for child in self.children: 168 string += child.descr(indent + 4) 169 return string 170 171 def __repr__(self) -> str: 172 values = [] 173 if self.type: values.append(f"type={self.type}") 174 if self.title: values.append(f"title={self.title}") 175 if self.actual_text: values.append(f"act_text={self.actual_text}") 176 if self.alt_text: values.append(f"alt_text={self.alt_text}") 177 if self.id: values.append(f"id={self.id}") 178 values += [f"mid={i}" for i in self.marked_ids] 179 values += [f"{k}={v}" for k, v in self.attributes.items()] 180 return f"S({','.join(map(str, values))})"
A PDF/UA ("tagged PDF") contains the structure of content as a tree data structure with similar semantics to HTML.
This class is a convenience wrapper around the pdfium structtree methods.
@cached_property
def title(self) -> str:
    """
    Title `/T`

    Read once through `FPDF_StructElement_GetTitle`, then cached.
    """
    getter = pp.raw.FPDF_StructElement_GetTitle
    return self._get_string(getter)
Title /T
@cached_property
def actual_text(self) -> str:
    """
    The actual text.

    Read once through `FPDF_StructElement_GetActualText`, then cached.
    """
    getter = pp.raw.FPDF_StructElement_GetActualText
    return self._get_string(getter)
The actual text.
@cached_property
def alt_text(self) -> str:
    """
    Alternate Text

    Read once through `FPDF_StructElement_GetAltText`, then cached.
    """
    getter = pp.raw.FPDF_StructElement_GetAltText
    return self._get_string(getter)
Alternate Text
@cached_property
def type(self) -> str:
    """
    Type `/S`

    Read once through `FPDF_StructElement_GetType`, then cached.
    """
    getter = pp.raw.FPDF_StructElement_GetType
    return self._get_string(getter)
Type /S
@cached_property
def obj_type(self) -> str:
    """
    Object Type `/Type`

    Read once through `FPDF_StructElement_GetObjType`, then cached.
    """
    getter = pp.raw.FPDF_StructElement_GetObjType
    return self._get_string(getter)
Object Type /Type
@cached_property
def language(self) -> str:
    """
    The case-insensitive IETF BCP 47 language code.

    Read once through `FPDF_StructElement_GetLang`, then cached.
    """
    getter = pp.raw.FPDF_StructElement_GetLang
    return self._get_string(getter)
The case-insensitive IETF BCP 47 language code.
@cached_property
def id(self) -> str:
    """
    Identifier

    Read once through `FPDF_StructElement_GetID`, then cached.
    """
    getter = pp.raw.FPDF_StructElement_GetID
    return self._get_string(getter)
Identifier
@cached_property
def marked_ids(self) -> list[int]:
    """
    List of marked content identifiers

    Every index reported by pdfium is queried in order; entries that come
    back as -1 are left out.
    """
    total = pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)
    candidates = (
        pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, position)
        for position in range(total)
    )
    return [mcid for mcid in candidates if mcid != -1]
List of marked content identifiers
91 @cached_property 92 def attributes(self) -> dict[str, str|bool|float]: 93 """ 94 All attributes of this structure element as a dictionary. 95 96 .. note:: 97 Due to limitations of the pdfium API, attribute arrays cannot be 98 extracted! The values are marked as `[?]` in the dictionary. 99 """ 100 kv = {} 101 for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)): 102 attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex) 103 for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)): 104 # Get the name 105 clength = ctypes.c_ulong(0) 106 cname = ctypes.create_string_buffer(1) # workaround to get length 107 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength) 108 cname = ctypes.create_string_buffer(clength.value) 109 assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength) 110 name = cname.raw.decode("utf-8", errors="ignore") 111 112 # Get the type 113 atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname) 114 assert atype != pp.raw.FPDF_OBJECT_UNKNOWN 115 116 # Then get each type individually 117 match atype: 118 case pp.raw.FPDF_OBJECT_BOOLEAN: 119 cbool = ctypes.bool() 120 assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool) 121 kv[name] = cbool.value 122 123 case pp.raw.FPDF_OBJECT_NUMBER: 124 cfloat = ctypes.c_float() 125 assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat) 126 kv[name] = cfloat.value 127 128 case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME: 129 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength) 130 cattrname = ctypes.create_string_buffer(clength.value*2) 131 assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength) 132 kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[:clength.value-1] 133 134 # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed? 
135 # case pp.raw.FPDF_OBJECT_ARRAY: 136 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength) 137 # cblob = ctypes.create_string_buffer(clength.value) 138 # assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength) 139 # kv[name] = cblob.raw 140 141 case pp.raw.FPDF_OBJECT_ARRAY: 142 kv[name] = f"[?]" 143 144 case _: 145 kv[name] = f"[unknown={atype}?]" 146 return kv
All attributes of this structure element as a dictionary.
Due to limitations of the pdfium API, attribute arrays cannot be extracted! The values are marked as `[?]` in the dictionary.
@cache
def child(self, index: int) -> "Structure":
    """
    :param index: 0-index of child.
    :return: Child structure, created with this node as its parent.
    """
    # NOTE(review): `functools.cache` on a method keys on `self` and keeps
    # every instance alive for the lifetime of the cache.
    element = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
    return Structure(self._page, element, parent=self)
Parameters
- index: 0-index of child.
Returns
Child structure.
@property
def children(self) -> list:
    """All child structures, yielded lazily in index order."""
    total = pp.raw.FPDF_StructElement_CountChildren(self._element)
    yield from map(self.child, range(total))
All child structures.
def descr(self, indent=0) -> str:
    """
    Description including all children via indentation.

    :param indent: Number of spaces in front of this node; each level of
                   children is indented by four more.
    :return: One line per node, each terminated by a newline.
    """
    header = " " * indent + repr(self) + "\n"
    nested = (kid.descr(indent + 4) for kid in self.children)
    return header + "".join(nested)
Description including all children via indentation.