modm_data.pdf.character

PDF Characters

Each character on the PDF page is represented by a character object, describing exactly where and how to render the associated glyph.

While there are font flags, PDF files typically use entirely different fonts to render normal, bold, and italic characters.

The character's loose bounding box may not always be available, since it must be explicitly provided by the font. The tight bounding box is only available as long as the glyph is renderable, so a space character may have a loose, but not a tight bounding box, or none at all.

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4"""
  5# PDF Characters
  6
  7Each character on the PDF page is represented by a character object, describing
  8exactly where and how to render the associated glyph.
  9
 10While there are font flags, PDF files typically use entirely different fonts to
 11render normal, bold, and italic characters.
 12
 13The character's loose bounding box may not always be available, since it must be
 14explicitly provided by the font. The tight bounding box is only available as
 15long as the glyph is renderable, so a space character may have a loose, but not
 16a tight bounding box, or none at all.
 17"""
 18
 19import math
 20import ctypes
 21from functools import cached_property
 22from enum import Enum
 23import pypdfium2 as pp
 24from ..utils import Rectangle, Point
 25
 26
 27class Character:
 28    """
 29    This class contains all information about a single character in the PDF
 30    page.
 31    """
 32    class RenderMode(Enum):
 33        """Tells the PDF viewer how to render this character glyph."""
 34        UNKNOWN = -1
 35        FILL = 0
 36        STROKE = 1
 37        FILL_STROKE = 2
 38        INVISIBLE = 3
 39        FILL_CLIP = 4
 40        STROKE_CLIP = 5
 41        FILL_STROKE_CLIP = 6
 42        CLIP = 7
 43
 44    def __init__(self, page: "modm_data.pdf.page.Page", index: int):
 45        """
 46        :param page: The page containing the character.
 47        :param index: The index of the character.
 48        """
 49        self._page = page
 50        self._text = page._text
 51        self._index = index
 52        self._font = None
 53        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))
 54
 55        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
 56        """The unicode value of the character."""
 57        self.objlink: "modm_data.pdf.link.ObjLink" = None
 58        """The object link of this character or `None`"""
 59        self.weblink: "modm_data.pdf.link.WebLink" = None
 60        """The web link of this character or `None`"""
 61
 62        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
 63        if self._page.rotation:
 64            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x,
 65                             bbox.p1.y, self._page.height - bbox.p0.x)
 66        self._bbox = bbox
 67
 68    def _font_flags(self) -> tuple[str, int]:
 69        if self._font is None:
 70            font = ctypes.create_string_buffer(255)
 71            flags = ctypes.c_int()
 72            pp.raw.FPDFText_GetFontInfo(self._text, self._index, font, 255, flags)
 73            self._font = (font.value.decode("utf-8"), flags.value)
 74        return self._font
 75
 76    @property
 77    def char(self) -> str:
 78        """The printable string of the unicode value."""
 79        char = chr(self.unicode)
 80        return char if char.isprintable() else ""
 81
 82    @cached_property
 83    def origin(self) -> Point:
 84        """The origin of the character."""
 85        x, y = ctypes.c_double(), ctypes.c_double()
 86        assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y)
 87        if self._page.rotation:
 88            return Point(y.value, self._page.height - x.value)
 89        return Point(x.value, y.value)
 90
 91    @cached_property
 92    def width(self) -> float:
 93        """The width of the character's bounding box."""
 94        if self.rotation:
 95            return self.bbox.height
 96        return self.bbox.width
 97
 98    @cached_property
 99    def height(self) -> float:
100        """The height of the character's bounding box."""
101        if self.rotation:
102            return self.bbox.width
103        return self.bbox.height
104
105    @cached_property
106    def tbbox(self) -> Rectangle:
107        """The tight bounding box of the character."""
108        tbbox = Rectangle(*self._text.get_charbox(self._index))
109        if self._page.rotation:
110            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x,
111                              tbbox.p1.y, self._page.height - tbbox.p0.x)
112        return tbbox
113
114    @property
115    def bbox(self) -> Rectangle:
116        """
117        The loose bounding box of the character.
118        .. note::
119            If the loose bounding box is not available, the tight bounding box
120            is used instead.
121        """
122        if not self._bbox.width or not self._bbox.height:
123            return self.tbbox
124        return self._bbox
125
126    @cached_property
127    def twidth(self) -> float:
128        """The width of the character's tight bounding box."""
129        return self.tbbox.width
130
131    @cached_property
132    def theight(self) -> float:
133        """The height of the character's tight bounding box."""
134        return self.tbbox.height
135
136    @cached_property
137    def render_mode(self) -> RenderMode:
138        """The render mode of the character."""
139        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))
140
141    @cached_property
142    def rotation(self) -> int:
143        """The rotation of the character in degrees modulo 360."""
144        # Special case for vertical text in rotated pages
145        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xa, 0xd}:
146            return 90
147        if self._page.rotation and self._rotation:
148            return (self._page.rotation + self._rotation) % 360
149        return self._rotation
150
151    @cached_property
152    def size(self) -> float:
153        """The font size of the character."""
154        return pp.raw.FPDFText_GetFontSize(self._text, self._index)
155
156    @cached_property
157    def weight(self) -> int:
158        """The font weight of the character."""
159        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)
160
161    @cached_property
162    def fill(self) -> int:
163        """The fill color of the character."""
164        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
165        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
166        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
167
168    @cached_property
169    def stroke(self) -> int:
170        """The stroke color of the character."""
171        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
172        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
173        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
174
175    @cached_property
176    def font(self) -> str:
177        """The font name of the character."""
178        return self._font_flags()[0]
179
180    @cached_property
181    def flags(self) -> int:
182        """The font flags of the character."""
183        return self._font_flags()[1]
184
185    def descr(self) -> str:
186        """Human-readable description of the character for debugging."""
187        char = chr(self.unicode)
188        if not char.isprintable():
189            char = hex(self.unicode)
190        return f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, " \
191               f"{self.render_mode}, {self.font}, {hex(self.flags)}, " \
192               f"{self.fill}, {self.stroke}, {repr(self.bbox)})"
193
194    def __str__(self) -> str:
195        return self.char
196
197    def __repr__(self) -> str:
198        char = chr(self.unicode)
199        escape = {0xa: "\\n", 0xd: "\\r", 0x9: "\\t", 0x20: "␣"}
200        char = escape.get(self.unicode, char if char.isprintable() else hex(self.unicode))
201        return char
class Character:
 28class Character:
 29    """
 30    This class contains all information about a single character in the PDF
 31    page.
 32    """
 33    class RenderMode(Enum):
 34        """Tells the PDF viewer how to render this character glyph."""
 35        UNKNOWN = -1
 36        FILL = 0
 37        STROKE = 1
 38        FILL_STROKE = 2
 39        INVISIBLE = 3
 40        FILL_CLIP = 4
 41        STROKE_CLIP = 5
 42        FILL_STROKE_CLIP = 6
 43        CLIP = 7
 44
 45    def __init__(self, page: "modm_data.pdf.page.Page", index: int):
 46        """
 47        :param page: The page containing the character.
 48        :param index: The index of the character.
 49        """
 50        self._page = page
 51        self._text = page._text
 52        self._index = index
 53        self._font = None
 54        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))
 55
 56        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
 57        """The unicode value of the character."""
 58        self.objlink: "modm_data.pdf.link.ObjLink" = None
 59        """The object link of this character or `None`"""
 60        self.weblink: "modm_data.pdf.link.WebLink" = None
 61        """The web link of this character or `None`"""
 62
 63        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
 64        if self._page.rotation:
 65            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x,
 66                             bbox.p1.y, self._page.height - bbox.p0.x)
 67        self._bbox = bbox
 68
 69    def _font_flags(self) -> tuple[str, int]:
 70        if self._font is None:
 71            font = ctypes.create_string_buffer(255)
 72            flags = ctypes.c_int()
 73            pp.raw.FPDFText_GetFontInfo(self._text, self._index, font, 255, flags)
 74            self._font = (font.value.decode("utf-8"), flags.value)
 75        return self._font
 76
 77    @property
 78    def char(self) -> str:
 79        """The printable string of the unicode value."""
 80        char = chr(self.unicode)
 81        return char if char.isprintable() else ""
 82
 83    @cached_property
 84    def origin(self) -> Point:
 85        """The origin of the character."""
 86        x, y = ctypes.c_double(), ctypes.c_double()
 87        assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y)
 88        if self._page.rotation:
 89            return Point(y.value, self._page.height - x.value)
 90        return Point(x.value, y.value)
 91
 92    @cached_property
 93    def width(self) -> float:
 94        """The width of the character's bounding box."""
 95        if self.rotation:
 96            return self.bbox.height
 97        return self.bbox.width
 98
 99    @cached_property
100    def height(self) -> float:
101        """The height of the character's bounding box."""
102        if self.rotation:
103            return self.bbox.width
104        return self.bbox.height
105
106    @cached_property
107    def tbbox(self) -> Rectangle:
108        """The tight bounding box of the character."""
109        tbbox = Rectangle(*self._text.get_charbox(self._index))
110        if self._page.rotation:
111            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x,
112                              tbbox.p1.y, self._page.height - tbbox.p0.x)
113        return tbbox
114
115    @property
116    def bbox(self) -> Rectangle:
117        """
118        The loose bounding box of the character.
119        .. note::
120            If the loose bounding box is not available, the tight bounding box
121            is used instead.
122        """
123        if not self._bbox.width or not self._bbox.height:
124            return self.tbbox
125        return self._bbox
126
127    @cached_property
128    def twidth(self) -> float:
129        """The width of the character's tight bounding box."""
130        return self.tbbox.width
131
132    @cached_property
133    def theight(self) -> float:
134        """The height of the character's tight bounding box."""
135        return self.tbbox.height
136
137    @cached_property
138    def render_mode(self) -> RenderMode:
139        """The render mode of the character."""
140        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))
141
142    @cached_property
143    def rotation(self) -> int:
144        """The rotation of the character in degrees modulo 360."""
145        # Special case for vertical text in rotated pages
146        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xa, 0xd}:
147            return 90
148        if self._page.rotation and self._rotation:
149            return (self._page.rotation + self._rotation) % 360
150        return self._rotation
151
152    @cached_property
153    def size(self) -> float:
154        """The font size of the character."""
155        return pp.raw.FPDFText_GetFontSize(self._text, self._index)
156
157    @cached_property
158    def weight(self) -> int:
159        """The font weight of the character."""
160        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)
161
162    @cached_property
163    def fill(self) -> int:
164        """The fill color of the character."""
165        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
166        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
167        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
168
169    @cached_property
170    def stroke(self) -> int:
171        """The stroke color of the character."""
172        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
173        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
174        return r.value << 24 | g.value << 16 | b.value << 8 | a.value
175
176    @cached_property
177    def font(self) -> str:
178        """The font name of the character."""
179        return self._font_flags()[0]
180
181    @cached_property
182    def flags(self) -> int:
183        """The font flags of the character."""
184        return self._font_flags()[1]
185
186    def descr(self) -> str:
187        """Human-readable description of the character for debugging."""
188        char = chr(self.unicode)
189        if not char.isprintable():
190            char = hex(self.unicode)
191        return f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, " \
192               f"{self.render_mode}, {self.font}, {hex(self.flags)}, " \
193               f"{self.fill}, {self.stroke}, {repr(self.bbox)})"
194
195    def __str__(self) -> str:
196        return self.char
197
198    def __repr__(self) -> str:
199        char = chr(self.unicode)
200        escape = {0xa: "\\n", 0xd: "\\r", 0x9: "\\t", 0x20: "␣"}
201        char = escape.get(self.unicode, char if char.isprintable() else hex(self.unicode))
202        return char

This class contains all information about a single character in the PDF page.

Character(page: modm_data.pdf.page.Page, index: int)
45    def __init__(self, page: "modm_data.pdf.page.Page", index: int):
46        """
47        :param page: The page containing the character.
48        :param index: The index of the character.
49        """
50        self._page = page
51        self._text = page._text
52        self._index = index
53        self._font = None
54        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))
55
56        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
57        """The unicode value of the character."""
58        self.objlink: "modm_data.pdf.link.ObjLink" = None
59        """The object link of this character or `None`"""
60        self.weblink: "modm_data.pdf.link.WebLink" = None
61        """The web link of this character or `None`"""
62
63        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
64        if self._page.rotation:
65            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x,
66                             bbox.p1.y, self._page.height - bbox.p0.x)
67        self._bbox = bbox
Parameters
  • page: The page containing the character.
  • index: The index of the character.
unicode: int

The unicode value of the character.

char: str
77    @property
78    def char(self) -> str:
79        """The printable string of the unicode value."""
80        char = chr(self.unicode)
81        return char if char.isprintable() else ""

The printable string of the unicode value.

origin: modm_data.utils.math.Point
83    @cached_property
84    def origin(self) -> Point:
85        """The origin of the character."""
86        x, y = ctypes.c_double(), ctypes.c_double()
87        assert pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y)
88        if self._page.rotation:
89            return Point(y.value, self._page.height - x.value)
90        return Point(x.value, y.value)

The origin of the character.

width: float
92    @cached_property
93    def width(self) -> float:
94        """The width of the character's bounding box."""
95        if self.rotation:
96            return self.bbox.height
97        return self.bbox.width

The width of the character's bounding box.

height: float
 99    @cached_property
100    def height(self) -> float:
101        """The height of the character's bounding box."""
102        if self.rotation:
103            return self.bbox.width
104        return self.bbox.height

The height of the character's bounding box.

tbbox: modm_data.utils.math.Rectangle
106    @cached_property
107    def tbbox(self) -> Rectangle:
108        """The tight bounding box of the character."""
109        tbbox = Rectangle(*self._text.get_charbox(self._index))
110        if self._page.rotation:
111            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x,
112                              tbbox.p1.y, self._page.height - tbbox.p0.x)
113        return tbbox

The tight bounding box of the character.

bbox: modm_data.utils.math.Rectangle
115    @property
116    def bbox(self) -> Rectangle:
117        """
118        The loose bounding box of the character.
119        .. note::
120            If the loose bounding box is not available, the tight bounding box
121            is used instead.
122        """
123        if not self._bbox.width or not self._bbox.height:
124            return self.tbbox
125        return self._bbox

The loose bounding box of the character.

If the loose bounding box is not available, the tight bounding box is used instead.

twidth: float
127    @cached_property
128    def twidth(self) -> float:
129        """The width of the character's tight bounding box."""
130        return self.tbbox.width

The width of the character's tight bounding box.

theight: float
132    @cached_property
133    def theight(self) -> float:
134        """The height of the character's tight bounding box."""
135        return self.tbbox.height

The height of the character's tight bounding box.

render_mode: Character.RenderMode
137    @cached_property
138    def render_mode(self) -> RenderMode:
139        """The render mode of the character."""
140        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))

The render mode of the character.

rotation: int
142    @cached_property
143    def rotation(self) -> int:
144        """The rotation of the character in degrees modulo 360."""
145        # Special case for vertical text in rotated pages
146        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xa, 0xd}:
147            return 90
148        if self._page.rotation and self._rotation:
149            return (self._page.rotation + self._rotation) % 360
150        return self._rotation

The rotation of the character in degrees modulo 360.

size: float
152    @cached_property
153    def size(self) -> float:
154        """The font size of the character."""
155        return pp.raw.FPDFText_GetFontSize(self._text, self._index)

The font size of the character.

weight: int
157    @cached_property
158    def weight(self) -> int:
159        """The font weight of the character."""
160        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)

The font weight of the character.

fill: int
162    @cached_property
163    def fill(self) -> int:
164        """The fill color of the character."""
165        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
166        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
167        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The fill color of the character.

stroke: int
169    @cached_property
170    def stroke(self) -> int:
171        """The stroke color of the character."""
172        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
173        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
174        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

The stroke color of the character.

font: str
176    @cached_property
177    def font(self) -> str:
178        """The font name of the character."""
179        return self._font_flags()[0]

The font name of the character.

flags: int
181    @cached_property
182    def flags(self) -> int:
183        """The font flags of the character."""
184        return self._font_flags()[1]

The font flags of the character.

def descr(self) -> str:
186    def descr(self) -> str:
187        """Human-readable description of the character for debugging."""
188        char = chr(self.unicode)
189        if not char.isprintable():
190            char = hex(self.unicode)
191        return f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, " \
192               f"{self.render_mode}, {self.font}, {hex(self.flags)}, " \
193               f"{self.fill}, {self.stroke}, {repr(self.bbox)})"

Human-readable description of the character for debugging.

class Character.RenderMode(enum.Enum):
33    class RenderMode(Enum):
34        """Tells the PDF viewer how to render this character glyph."""
35        UNKNOWN = -1
36        FILL = 0
37        STROKE = 1
38        FILL_STROKE = 2
39        INVISIBLE = 3
40        FILL_CLIP = 4
41        STROKE_CLIP = 5
42        FILL_STROKE_CLIP = 6
43        CLIP = 7

Tells the PDF viewer how to render this character glyph.

UNKNOWN = <RenderMode.UNKNOWN: -1>
FILL = <RenderMode.FILL: 0>
STROKE = <RenderMode.STROKE: 1>
FILL_STROKE = <RenderMode.FILL_STROKE: 2>
INVISIBLE = <RenderMode.INVISIBLE: 3>
FILL_CLIP = <RenderMode.FILL_CLIP: 4>
STROKE_CLIP = <RenderMode.STROKE_CLIP: 5>
FILL_STROKE_CLIP = <RenderMode.FILL_STROKE_CLIP: 6>
CLIP = <RenderMode.CLIP: 7>
Inherited Members
enum.Enum
name
value