modm_data.pdf.structure

Tagged PDFs

A tagged PDF/UA (Universal Accessibility) contains the structure of content as a tree data structure with similar semantics to HTML. Sadly, the quality of the tags depends heavily on the PDF creation software. See Overview of PDF tags.

An example of an accessible PDF that can be inspected via these classes: Rock On, D.C. Music Festival.

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4"""
  5# Tagged PDFs
  6
  7A tagged PDF/UA (Universal Accessibility) contains the structure of content as a
  8tree data structure with similar semantics to HTML. Sadly, the quality of the
  9tags depends heavily on the PDF creation software. See [Overview of PDF tags](
 10https://accessible-pdf.info/en/basics/general/overview-of-the-pdf-tags/).
 11
 12An example of an accessible pdf that can be inspected via these classes:
 13[Rock On, D.C. Music Festival](
 14https://commonlook.com/wp-content/uploads/2020/04/accessible-pdf-example.pdf).
 15"""
 16
import ctypes
import weakref
from collections.abc import Iterator
from functools import cached_property, cache

import pypdfium2 as pp
 21
 22
 23class Structure:
 24    """
 25    A PDF/UA ("tagged PDF") contains the structure of content as a tree data
 26    structure with similar semantics to HTML.
 27
 28    This class is a convenience wrapper around [the pdfium structtree methods](
 29    https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h).
 30    """
 31    def __init__(self, page: "modm_data.pdf.page.Page",
 32                 element: pp.raw.FPDF_STRUCTELEMENT,
 33                 parent: "Structure" = None):
 34        self._page = page
 35        self._element = element
 36        self.parent: Structure = weakref.ref(parent) if parent else None
 37        """The parent node."""
 38
 39    def _get_string(self, function) -> str:
 40        length = function(self._element, 0, 0)
 41        clength = ctypes.c_ulong(length)
 42        cbuffer = ctypes.create_string_buffer(length)
 43        function(self._element, cbuffer, clength)
 44        return bytes(cbuffer).decode("utf-16-le", errors="ignore")
 45
 46    @cached_property
 47    def title(self) -> str:
 48        """Title `/T`"""
 49        return self._get_string(pp.raw.FPDF_StructElement_GetTitle)
 50
 51    @cached_property
 52    def actual_text(self) -> str:
 53        """The actual text."""
 54        return self._get_string(pp.raw.FPDF_StructElement_GetActualText)
 55
 56    @cached_property
 57    def alt_text(self) -> str:
 58        """Alternate Text"""
 59        return self._get_string(pp.raw.FPDF_StructElement_GetAltText)
 60
 61    @cached_property
 62    def type(self) -> str:
 63        """Type `/S`"""
 64        return self._get_string(pp.raw.FPDF_StructElement_GetType)
 65
 66    @cached_property
 67    def obj_type(self) -> str:
 68        """Object Type `/Type`"""
 69        return self._get_string(pp.raw.FPDF_StructElement_GetObjType)
 70
 71    @cached_property
 72    def language(self) -> str:
 73        """The case-insensitive IETF BCP 47 language code."""
 74        return self._get_string(pp.raw.FPDF_StructElement_GetLang)
 75
 76    @cached_property
 77    def id(self) -> str:
 78        """Identifier"""
 79        return self._get_string(pp.raw.FPDF_StructElement_GetID)
 80
 81    @cached_property
 82    def marked_ids(self) -> list[int]:
 83        """List of marked content identifiers"""
 84        ids = []
 85        for index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)):
 86            if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1:
 87                ids.append(mcid)
 88        return ids
 89
 90    @cached_property
 91    def attributes(self) -> dict[str, str|bool|float]:
 92        """
 93        All attributes of this structure element as a dictionary.
 94
 95        .. note::
 96            Due to limitations of the pdfium API, attribute arrays cannot be
 97            extracted! The values are marked as `[?]` in the dictionary.
 98        """
 99        kv = {}
100        for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)):
101            attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex)
102            for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)):
103                # Get the name
104                clength = ctypes.c_ulong(0)
105                cname = ctypes.create_string_buffer(1) # workaround to get length
106                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength)
107                cname = ctypes.create_string_buffer(clength.value)
108                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength)
109                name = cname.raw.decode("utf-8", errors="ignore")
110
111                # Get the type
112                atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname)
113                assert atype != pp.raw.FPDF_OBJECT_UNKNOWN
114
115                # Then get each type individually
116                match atype:
117                    case pp.raw.FPDF_OBJECT_BOOLEAN:
118                        cbool = ctypes.bool()
119                        assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool)
120                        kv[name] = cbool.value
121
122                    case pp.raw.FPDF_OBJECT_NUMBER:
123                        cfloat = ctypes.c_float()
124                        assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat)
125                        kv[name] = cfloat.value
126
127                    case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME:
128                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength)
129                        cattrname = ctypes.create_string_buffer(clength.value*2)
130                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength)
131                        kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[:clength.value-1]
132
133                    # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed?
134                    # case pp.raw.FPDF_OBJECT_ARRAY:
135                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength)
136                    #     cblob = ctypes.create_string_buffer(clength.value)
137                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength)
138                    #     kv[name] = cblob.raw
139
140                    case pp.raw.FPDF_OBJECT_ARRAY:
141                        kv[name] = f"[?]"
142
143                    case _:
144                        kv[name] = f"[unknown={atype}?]"
145        return kv
146
147    @cache
148    def child(self, index: int) -> "Structure":
149        """
150        :param index: 0-index of child.
151        :return: Child structure.
152        """
153        index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
154        return Structure(self._page, index, self)
155
156    @property
157    def children(self) -> list:
158        """All child structures."""
159        count = pp.raw.FPDF_StructElement_CountChildren(self._element)
160        for ii in range(count):
161            yield self.child(ii)
162
163    def descr(self, indent=0) -> str:
164        """Description including all children via indentation."""
165        string = " " * indent + repr(self) + "\n"
166        for child in self.children:
167            string += child.descr(indent + 4)
168        return string
169
170    def __repr__(self) -> str:
171        values = []
172        if self.type: values.append(f"type={self.type}")
173        if self.title: values.append(f"title={self.title}")
174        if self.actual_text: values.append(f"act_text={self.actual_text}")
175        if self.alt_text: values.append(f"alt_text={self.alt_text}")
176        if self.id: values.append(f"id={self.id}")
177        values += [f"mid={i}" for i in self.marked_ids]
178        values += [f"{k}={v}" for k, v in self.attributes.items()]
179        return f"S({','.join(map(str, values))})"
class Structure:
 24class Structure:
 25    """
 26    A PDF/UA ("tagged PDF") contains the structure of content as a tree data
 27    structure with similar semantics to HTML.
 28
 29    This class is a convenience wrapper around [the pdfium structtree methods](
 30    https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h).
 31    """
 32    def __init__(self, page: "modm_data.pdf.page.Page",
 33                 element: pp.raw.FPDF_STRUCTELEMENT,
 34                 parent: "Structure" = None):
 35        self._page = page
 36        self._element = element
 37        self.parent: Structure = weakref.ref(parent) if parent else None
 38        """The parent node."""
 39
 40    def _get_string(self, function) -> str:
 41        length = function(self._element, 0, 0)
 42        clength = ctypes.c_ulong(length)
 43        cbuffer = ctypes.create_string_buffer(length)
 44        function(self._element, cbuffer, clength)
 45        return bytes(cbuffer).decode("utf-16-le", errors="ignore")
 46
 47    @cached_property
 48    def title(self) -> str:
 49        """Title `/T`"""
 50        return self._get_string(pp.raw.FPDF_StructElement_GetTitle)
 51
 52    @cached_property
 53    def actual_text(self) -> str:
 54        """The actual text."""
 55        return self._get_string(pp.raw.FPDF_StructElement_GetActualText)
 56
 57    @cached_property
 58    def alt_text(self) -> str:
 59        """Alternate Text"""
 60        return self._get_string(pp.raw.FPDF_StructElement_GetAltText)
 61
 62    @cached_property
 63    def type(self) -> str:
 64        """Type `/S`"""
 65        return self._get_string(pp.raw.FPDF_StructElement_GetType)
 66
 67    @cached_property
 68    def obj_type(self) -> str:
 69        """Object Type `/Type`"""
 70        return self._get_string(pp.raw.FPDF_StructElement_GetObjType)
 71
 72    @cached_property
 73    def language(self) -> str:
 74        """The case-insensitive IETF BCP 47 language code."""
 75        return self._get_string(pp.raw.FPDF_StructElement_GetLang)
 76
 77    @cached_property
 78    def id(self) -> str:
 79        """Identifier"""
 80        return self._get_string(pp.raw.FPDF_StructElement_GetID)
 81
 82    @cached_property
 83    def marked_ids(self) -> list[int]:
 84        """List of marked content identifiers"""
 85        ids = []
 86        for index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)):
 87            if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1:
 88                ids.append(mcid)
 89        return ids
 90
 91    @cached_property
 92    def attributes(self) -> dict[str, str|bool|float]:
 93        """
 94        All attributes of this structure element as a dictionary.
 95
 96        .. note::
 97            Due to limitations of the pdfium API, attribute arrays cannot be
 98            extracted! The values are marked as `[?]` in the dictionary.
 99        """
100        kv = {}
101        for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)):
102            attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex)
103            for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)):
104                # Get the name
105                clength = ctypes.c_ulong(0)
106                cname = ctypes.create_string_buffer(1) # workaround to get length
107                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength)
108                cname = ctypes.create_string_buffer(clength.value)
109                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength)
110                name = cname.raw.decode("utf-8", errors="ignore")
111
112                # Get the type
113                atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname)
114                assert atype != pp.raw.FPDF_OBJECT_UNKNOWN
115
116                # Then get each type individually
117                match atype:
118                    case pp.raw.FPDF_OBJECT_BOOLEAN:
119                        cbool = ctypes.bool()
120                        assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool)
121                        kv[name] = cbool.value
122
123                    case pp.raw.FPDF_OBJECT_NUMBER:
124                        cfloat = ctypes.c_float()
125                        assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat)
126                        kv[name] = cfloat.value
127
128                    case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME:
129                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength)
130                        cattrname = ctypes.create_string_buffer(clength.value*2)
131                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength)
132                        kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[:clength.value-1]
133
134                    # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed?
135                    # case pp.raw.FPDF_OBJECT_ARRAY:
136                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength)
137                    #     cblob = ctypes.create_string_buffer(clength.value)
138                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength)
139                    #     kv[name] = cblob.raw
140
141                    case pp.raw.FPDF_OBJECT_ARRAY:
142                        kv[name] = f"[?]"
143
144                    case _:
145                        kv[name] = f"[unknown={atype}?]"
146        return kv
147
148    @cache
149    def child(self, index: int) -> "Structure":
150        """
151        :param index: 0-index of child.
152        :return: Child structure.
153        """
154        index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
155        return Structure(self._page, index, self)
156
157    @property
158    def children(self) -> list:
159        """All child structures."""
160        count = pp.raw.FPDF_StructElement_CountChildren(self._element)
161        for ii in range(count):
162            yield self.child(ii)
163
164    def descr(self, indent=0) -> str:
165        """Description including all children via indentation."""
166        string = " " * indent + repr(self) + "\n"
167        for child in self.children:
168            string += child.descr(indent + 4)
169        return string
170
171    def __repr__(self) -> str:
172        values = []
173        if self.type: values.append(f"type={self.type}")
174        if self.title: values.append(f"title={self.title}")
175        if self.actual_text: values.append(f"act_text={self.actual_text}")
176        if self.alt_text: values.append(f"alt_text={self.alt_text}")
177        if self.id: values.append(f"id={self.id}")
178        values += [f"mid={i}" for i in self.marked_ids]
179        values += [f"{k}={v}" for k, v in self.attributes.items()]
180        return f"S({','.join(map(str, values))})"

A PDF/UA ("tagged PDF") contains the structure of content as a tree data structure with similar semantics to HTML.

This class is a convenience wrapper around the pdfium structtree methods.

Structure( page: modm_data.pdf.page.Page, element: pypdfium2_raw.bindings.LP_struct_fpdf_structelement_t__, parent: Structure = None)
32    def __init__(self, page: "modm_data.pdf.page.Page",
33                 element: pp.raw.FPDF_STRUCTELEMENT,
34                 parent: "Structure" = None):
35        self._page = page
36        self._element = element
37        self.parent: Structure = weakref.ref(parent) if parent else None
38        """The parent node."""
parent: Structure

The parent node.

title: str
47    @cached_property
48    def title(self) -> str:
49        """Title `/T`"""
50        return self._get_string(pp.raw.FPDF_StructElement_GetTitle)

Title /T

actual_text: str
52    @cached_property
53    def actual_text(self) -> str:
54        """The actual text."""
55        return self._get_string(pp.raw.FPDF_StructElement_GetActualText)

The actual text.

alt_text: str
57    @cached_property
58    def alt_text(self) -> str:
59        """Alternate Text"""
60        return self._get_string(pp.raw.FPDF_StructElement_GetAltText)

Alternate Text

type: str
62    @cached_property
63    def type(self) -> str:
64        """Type `/S`"""
65        return self._get_string(pp.raw.FPDF_StructElement_GetType)

Type /S

obj_type: str
67    @cached_property
68    def obj_type(self) -> str:
69        """Object Type `/Type`"""
70        return self._get_string(pp.raw.FPDF_StructElement_GetObjType)

Object Type /Type

language: str
72    @cached_property
73    def language(self) -> str:
74        """The case-insensitive IETF BCP 47 language code."""
75        return self._get_string(pp.raw.FPDF_StructElement_GetLang)

The case-insensitive IETF BCP 47 language code.

id: str
77    @cached_property
78    def id(self) -> str:
79        """Identifier"""
80        return self._get_string(pp.raw.FPDF_StructElement_GetID)

Identifier

marked_ids: list[int]
82    @cached_property
83    def marked_ids(self) -> list[int]:
84        """List of marked content identifiers"""
85        ids = []
86        for index in range(pp.raw.FPDF_StructElement_GetMarkedContentIdCount(self._element)):
87            if (mcid := pp.raw.FPDF_StructElement_GetMarkedContentIdAtIndex(self._element, index)) != -1:
88                ids.append(mcid)
89        return ids

List of marked content identifiers

attributes: dict[str, str | bool | float]
 91    @cached_property
 92    def attributes(self) -> dict[str, str|bool|float]:
 93        """
 94        All attributes of this structure element as a dictionary.
 95
 96        .. note::
 97            Due to limitations of the pdfium API, attribute arrays cannot be
 98            extracted! The values are marked as `[?]` in the dictionary.
 99        """
100        kv = {}
101        for eindex in range(pp.raw.FPDF_StructElement_GetAttributeCount(self._element)):
102            attr = pp.raw.FPDF_StructElement_GetAttributeAtIndex(self._element, eindex)
103            for aindex in range(pp.raw.FPDF_StructElement_Attr_GetCount(attr)):
104                # Get the name
105                clength = ctypes.c_ulong(0)
106                cname = ctypes.create_string_buffer(1) # workaround to get length
107                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, 0, clength)
108                cname = ctypes.create_string_buffer(clength.value)
109                assert pp.raw.FPDF_StructElement_Attr_GetName(attr, aindex, cname, clength, clength)
110                name = cname.raw.decode("utf-8", errors="ignore")
111
112                # Get the type
113                atype = pp.raw.FPDF_StructElement_Attr_GetType(attr, cname)
114                assert atype != pp.raw.FPDF_OBJECT_UNKNOWN
115
116                # Then get each type individually
117                match atype:
118                    case pp.raw.FPDF_OBJECT_BOOLEAN:
119                        cbool = ctypes.bool()
120                        assert pp.raw.FPDF_StructElement_Attr_GetBooleanValue(attr, cname, cbool)
121                        kv[name] = cbool.value
122
123                    case pp.raw.FPDF_OBJECT_NUMBER:
124                        cfloat = ctypes.c_float()
125                        assert pp.raw.FPDF_StructElement_Attr_GetNumberValue(attr, cname, cfloat)
126                        kv[name] = cfloat.value
127
128                    case pp.raw.FPDF_OBJECT_STRING | pp.raw.FPDF_OBJECT_NAME:
129                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, 0, 0, clength)
130                        cattrname = ctypes.create_string_buffer(clength.value*2)
131                        assert pp.raw.FPDF_StructElement_Attr_GetStringValue(attr, cname, cattrname, clength, clength)
132                        kv[name] = cattrname.raw.decode("utf-16-le", errors="ignore")[:clength.value-1]
133
134                    # FIXME: FPDF_OBJECT_ARRAY is not a blob, but no other APIs are exposed?
135                    # case pp.raw.FPDF_OBJECT_ARRAY:
136                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, 0, 0, clength)
137                    #     cblob = ctypes.create_string_buffer(clength.value)
138                    #     assert pp.raw.FPDF_StructElement_Attr_GetBlobValue(attr, cname, cblob, clength, clength)
139                    #     kv[name] = cblob.raw
140
141                    case pp.raw.FPDF_OBJECT_ARRAY:
142                        kv[name] = f"[?]"
143
144                    case _:
145                        kv[name] = f"[unknown={atype}?]"
146        return kv

All attributes of this structure element as a dictionary.

Due to limitations of the pdfium API, attribute arrays cannot be extracted! The values are marked as [?] in the dictionary.

@cache
def child(self, index: int) -> Structure:
148    @cache
149    def child(self, index: int) -> "Structure":
150        """
151        :param index: 0-index of child.
152        :return: Child structure.
153        """
154        index = pp.raw.FPDF_StructElement_GetChildAtIndex(self._element, index)
155        return Structure(self._page, index, self)
Parameters
  • index: 0-index of child.
Returns

Child structure.

children: list
157    @property
158    def children(self) -> list:
159        """All child structures."""
160        count = pp.raw.FPDF_StructElement_CountChildren(self._element)
161        for ii in range(count):
162            yield self.child(ii)

All child structures.

def descr(self, indent=0) -> str:
164    def descr(self, indent=0) -> str:
165        """Description including all children via indentation."""
166        string = " " * indent + repr(self) + "\n"
167        for child in self.children:
168            string += child.descr(indent + 4)
169        return string

Description including all children via indentation.