modm_data.pdf.character
PDF Characters
Each character on the PDF page is represented by a character object, describing exactly where and how to render the associated glyph.
While there are font flags, PDF files typically use entirely different fonts to render normal, bold, and italic characters.
The character's loose bounding box may not always be available, since it must be explicitly provided by the font. The tight bounding box is only available as long as the glyph is renderable, so a space character may have a loose, but not a tight bounding box, or none at all.
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# PDF Characters

Each character on the PDF page is represented by a character object, describing
exactly where and how to render the associated glyph.

While there are font flags, PDF files typically use entirely different fonts to
render normal, bold, and italic characters.

The character's loose bounding box may not always be available, since it must be
explicitly provided by the font. The tight bounding box is only available as
long as the glyph is renderable, so a space character may have a loose, but not
a tight bounding box, or none at all.
"""

import math
import ctypes
from functools import cached_property
from enum import Enum
import pypdfium2 as pp
from ..utils import Rectangle, Point


class Character:
    """
    This class contains all information about a single character in the PDF
    page.

    The unicode value, the character angle and the loose bounding box are
    queried in the constructor; all other properties are queried from pdfium
    on first access and cached afterwards.
    """
    class RenderMode(Enum):
        """Tells the PDF viewer how to render this character glyph."""
        UNKNOWN = -1
        FILL = 0
        STROKE = 1
        FILL_STROKE = 2
        INVISIBLE = 3
        FILL_CLIP = 4
        STROKE_CLIP = 5
        FILL_STROKE_CLIP = 6
        CLIP = 7

    def __init__(self, page: "modm_data.pdf.page.Page", index: int):
        """
        :param page: The page containing the character.
        :param index: The index of the character.
        """
        self._page = page
        self._text = page._text
        self._index = index
        # Lazily filled by `_font_flags()` with a `(font name, font flags)` tuple
        self._font = None
        # math.degrees() treats the pdfium angle as radians; int() truncates it
        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))

        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
        """The unicode value of the character."""
        self.objlink: "modm_data.pdf.link.ObjLink" = None
        """The object link of this character or `None`"""
        self.weblink: "modm_data.pdf.link.WebLink" = None
        """The web link of this character or `None`"""

        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
        if self._page.rotation:
            # Swap the axes and mirror along the page height for rotated pages
            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x,
                             bbox.p1.y, self._page.height - bbox.p0.x)
        self._bbox = bbox

    def _font_flags(self) -> tuple[str, int]:
        """
        Query the font name and font flags with a single pdfium call and
        remember the result for subsequent calls.

        :return: A tuple of `(font name, font flags)`.
        """
        if self._font is None:
            font = ctypes.create_string_buffer(255)
            flags = ctypes.c_int()
            pp.raw.FPDFText_GetFontInfo(self._text, self._index, font, 255, flags)
            self._font = (font.value.decode("utf-8"), flags.value)
        return self._font

    @property
    def char(self) -> str:
        """The printable string of the unicode value, otherwise an empty string."""
        char = chr(self.unicode)
        return char if char.isprintable() else ""

    @cached_property
    def origin(self) -> Point:
        """
        The origin of the character.

        :raises RuntimeError: If pdfium cannot provide the character origin.
        """
        x, y = ctypes.c_double(), ctypes.c_double()
        # The call must not live inside an `assert`: with `python -O` the whole
        # statement is stripped and the origin would silently remain (0, 0).
        if not pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y):
            raise RuntimeError(f"Unable to get the origin of character {self._index}!")
        if self._page.rotation:
            return Point(y.value, self._page.height - x.value)
        return Point(x.value, y.value)

    @cached_property
    def width(self) -> float:
        """
        The width of the character's bounding box.
        For rotated characters this is the height of `bbox`.
        """
        if self.rotation:
            return self.bbox.height
        return self.bbox.width

    @cached_property
    def height(self) -> float:
        """
        The height of the character's bounding box.
        For rotated characters this is the width of `bbox`.
        """
        if self.rotation:
            return self.bbox.width
        return self.bbox.height

    @cached_property
    def tbbox(self) -> Rectangle:
        """The tight bounding box of the character."""
        tbbox = Rectangle(*self._text.get_charbox(self._index))
        if self._page.rotation:
            # Same coordinate transformation as for the loose bounding box
            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x,
                              tbbox.p1.y, self._page.height - tbbox.p0.x)
        return tbbox

    @property
    def bbox(self) -> Rectangle:
        """
        The loose bounding box of the character.
        .. note::
            If the loose bounding box is not available, the tight bounding box
            is used instead.
        """
        # A loose box with zero width or height counts as "not available"
        if not self._bbox.width or not self._bbox.height:
            return self.tbbox
        return self._bbox

    @cached_property
    def twidth(self) -> float:
        """The width of the character's tight bounding box."""
        return self.tbbox.width

    @cached_property
    def theight(self) -> float:
        """The height of the character's tight bounding box."""
        return self.tbbox.height

    @cached_property
    def render_mode(self) -> RenderMode:
        """The render mode of the character."""
        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))

    @cached_property
    def rotation(self) -> int:
        """The rotation of the character in degrees modulo 360."""
        # Special case for vertical text in rotated pages
        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xa, 0xd}:
            return 90
        if self._page.rotation and self._rotation:
            return (self._page.rotation + self._rotation) % 360
        return self._rotation

    @cached_property
    def size(self) -> float:
        """The font size of the character."""
        return pp.raw.FPDFText_GetFontSize(self._text, self._index)

    @cached_property
    def weight(self) -> int:
        """The font weight of the character."""
        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)

    @cached_property
    def fill(self) -> int:
        """The fill color of the character packed as `0xRRGGBBAA`."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def stroke(self) -> int:
        """The stroke color of the character packed as `0xRRGGBBAA`."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def font(self) -> str:
        """The font name of the character."""
        return self._font_flags()[0]

    @cached_property
    def flags(self) -> int:
        """The font flags of the character."""
        return self._font_flags()[1]

    def descr(self) -> str:
        """Human-readable description of the character for debugging."""
        char = chr(self.unicode)
        if not char.isprintable():
            char = hex(self.unicode)
        return f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, " \
               f"{self.render_mode}, {self.font}, {hex(self.flags)}, " \
               f"{self.fill}, {self.stroke}, {repr(self.bbox)})"

    def __str__(self) -> str:
        """The printable character, or an empty string."""
        return self.char

    def __repr__(self) -> str:
        """The character with whitespace escaped and non-printables as hex."""
        char = chr(self.unicode)
        escape = {0xa: "\\n", 0xd: "\\r", 0x9: "\\t", 0x20: "␣"}
        char = escape.get(self.unicode, char if char.isprintable() else hex(self.unicode))
        return char
class Character:
    """
    This class contains all information about a single character in the PDF
    page.

    The unicode value, the character angle and the loose bounding box are
    queried in the constructor; all other properties are queried from pdfium
    on first access and cached afterwards.
    """
    class RenderMode(Enum):
        """Tells the PDF viewer how to render this character glyph."""
        UNKNOWN = -1
        FILL = 0
        STROKE = 1
        FILL_STROKE = 2
        INVISIBLE = 3
        FILL_CLIP = 4
        STROKE_CLIP = 5
        FILL_STROKE_CLIP = 6
        CLIP = 7

    def __init__(self, page: "modm_data.pdf.page.Page", index: int):
        """
        :param page: The page containing the character.
        :param index: The index of the character.
        """
        self._page = page
        self._text = page._text
        self._index = index
        # Lazily filled by `_font_flags()` with a `(font name, font flags)` tuple
        self._font = None
        # math.degrees() treats the pdfium angle as radians; int() truncates it
        self._rotation = int(math.degrees(pp.raw.FPDFText_GetCharAngle(self._text, self._index)))

        self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
        """The unicode value of the character."""
        self.objlink: "modm_data.pdf.link.ObjLink" = None
        """The object link of this character or `None`"""
        self.weblink: "modm_data.pdf.link.WebLink" = None
        """The web link of this character or `None`"""

        bbox = Rectangle(*self._text.get_charbox(self._index, loose=True))
        if self._page.rotation:
            # Swap the axes and mirror along the page height for rotated pages
            bbox = Rectangle(bbox.p0.y, self._page.height - bbox.p1.x,
                             bbox.p1.y, self._page.height - bbox.p0.x)
        self._bbox = bbox

    def _font_flags(self) -> tuple[str, int]:
        """
        Query the font name and font flags with a single pdfium call and
        remember the result for subsequent calls.

        :return: A tuple of `(font name, font flags)`.
        """
        if self._font is None:
            font = ctypes.create_string_buffer(255)
            flags = ctypes.c_int()
            pp.raw.FPDFText_GetFontInfo(self._text, self._index, font, 255, flags)
            self._font = (font.value.decode("utf-8"), flags.value)
        return self._font

    @property
    def char(self) -> str:
        """The printable string of the unicode value, otherwise an empty string."""
        char = chr(self.unicode)
        return char if char.isprintable() else ""

    @cached_property
    def origin(self) -> Point:
        """
        The origin of the character.

        :raises RuntimeError: If pdfium cannot provide the character origin.
        """
        x, y = ctypes.c_double(), ctypes.c_double()
        # The call must not live inside an `assert`: with `python -O` the whole
        # statement is stripped and the origin would silently remain (0, 0).
        if not pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y):
            raise RuntimeError(f"Unable to get the origin of character {self._index}!")
        if self._page.rotation:
            return Point(y.value, self._page.height - x.value)
        return Point(x.value, y.value)

    @cached_property
    def width(self) -> float:
        """
        The width of the character's bounding box.
        For rotated characters this is the height of `bbox`.
        """
        if self.rotation:
            return self.bbox.height
        return self.bbox.width

    @cached_property
    def height(self) -> float:
        """
        The height of the character's bounding box.
        For rotated characters this is the width of `bbox`.
        """
        if self.rotation:
            return self.bbox.width
        return self.bbox.height

    @cached_property
    def tbbox(self) -> Rectangle:
        """The tight bounding box of the character."""
        tbbox = Rectangle(*self._text.get_charbox(self._index))
        if self._page.rotation:
            # Same coordinate transformation as for the loose bounding box
            tbbox = Rectangle(tbbox.p0.y, self._page.height - tbbox.p1.x,
                              tbbox.p1.y, self._page.height - tbbox.p0.x)
        return tbbox

    @property
    def bbox(self) -> Rectangle:
        """
        The loose bounding box of the character.
        .. note::
            If the loose bounding box is not available, the tight bounding box
            is used instead.
        """
        # A loose box with zero width or height counts as "not available"
        if not self._bbox.width or not self._bbox.height:
            return self.tbbox
        return self._bbox

    @cached_property
    def twidth(self) -> float:
        """The width of the character's tight bounding box."""
        return self.tbbox.width

    @cached_property
    def theight(self) -> float:
        """The height of the character's tight bounding box."""
        return self.tbbox.height

    @cached_property
    def render_mode(self) -> RenderMode:
        """The render mode of the character."""
        return Character.RenderMode(pp.raw.FPDFText_GetTextRenderMode(self._text, self._index))

    @cached_property
    def rotation(self) -> int:
        """The rotation of the character in degrees modulo 360."""
        # Special case for vertical text in rotated pages
        if self._page.rotation == 90 and self._rotation == 0 and self.unicode not in {0x20, 0xa, 0xd}:
            return 90
        if self._page.rotation and self._rotation:
            return (self._page.rotation + self._rotation) % 360
        return self._rotation

    @cached_property
    def size(self) -> float:
        """The font size of the character."""
        return pp.raw.FPDFText_GetFontSize(self._text, self._index)

    @cached_property
    def weight(self) -> int:
        """The font weight of the character."""
        return pp.raw.FPDFText_GetFontWeight(self._text, self._index)

    @cached_property
    def fill(self) -> int:
        """The fill color of the character packed as `0xRRGGBBAA`."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        pp.raw.FPDFText_GetFillColor(self._text, self._index, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def stroke(self) -> int:
        """The stroke color of the character packed as `0xRRGGBBAA`."""
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        pp.raw.FPDFText_GetStrokeColor(self._text, self._index, r, g, b, a)
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def font(self) -> str:
        """The font name of the character."""
        return self._font_flags()[0]

    @cached_property
    def flags(self) -> int:
        """The font flags of the character."""
        return self._font_flags()[1]

    def descr(self) -> str:
        """Human-readable description of the character for debugging."""
        char = chr(self.unicode)
        if not char.isprintable():
            char = hex(self.unicode)
        return f"Chr({char}, {self.size}, {self.weight}, {self.rotation}, " \
               f"{self.render_mode}, {self.font}, {hex(self.flags)}, " \
               f"{self.fill}, {self.stroke}, {repr(self.bbox)})"

    def __str__(self) -> str:
        """The printable character, or an empty string."""
        return self.char

    def __repr__(self) -> str:
        """The character with whitespace escaped and non-printables as hex."""
        char = chr(self.unicode)
        escape = {0xa: "\\n", 0xd: "\\r", 0x9: "\\t", 0x20: "␣"}
        char = escape.get(self.unicode, char if char.isprintable() else hex(self.unicode))
        return char
This class contains all information about a single character in the PDF page.
def __init__(self, page: "modm_data.pdf.page.Page", index: int):
    """
    Capture the character's unicode value, angle and loose bounding box.

    :param page: The page containing the character.
    :param index: The index of the character.
    """
    self._page = page
    self._text = page._text
    self._index = index
    # Cache slot for the (font name, font flags) tuple
    self._font = None
    # math.degrees() treats the angle as radians; int() truncates the result
    angle = pp.raw.FPDFText_GetCharAngle(self._text, self._index)
    self._rotation = int(math.degrees(angle))

    self.unicode: int = pp.raw.FPDFText_GetUnicode(self._text, self._index)
    """The unicode value of the character."""
    self.objlink: "modm_data.pdf.link.ObjLink" = None
    """The object link of this character or `None`"""
    self.weblink: "modm_data.pdf.link.WebLink" = None
    """The web link of this character or `None`"""

    loose = Rectangle(*self._text.get_charbox(self._index, loose=True))
    if self._page.rotation:
        # Rotated page: swap the axes and mirror along the page height
        loose = Rectangle(
            loose.p0.y,
            self._page.height - loose.p1.x,
            loose.p1.y,
            self._page.height - loose.p0.x,
        )
    self._bbox = loose
Parameters
- page: The page containing the character.
- index: The index of the character.
@property
def char(self) -> str:
    """The character as a string, or an empty string if it is not printable."""
    glyph = chr(self.unicode)
    if glyph.isprintable():
        return glyph
    return ""
The printable string of the unicode value.
@cached_property
def origin(self) -> Point:
    """
    The origin of the character.

    :raises RuntimeError: If pdfium cannot provide the character origin.
    """
    x, y = ctypes.c_double(), ctypes.c_double()
    # The call must not live inside an `assert`: with `python -O` the whole
    # statement is stripped and the origin would silently remain (0, 0).
    if not pp.raw.FPDFText_GetCharOrigin(self._text, self._index, x, y):
        raise RuntimeError(f"Unable to get the origin of character {self._index}!")
    if self._page.rotation:
        # Rotated page: swap the axes and mirror along the page height
        return Point(y.value, self._page.height - x.value)
    return Point(x.value, y.value)
The origin of the character.
@cached_property
def width(self) -> float:
    """
    The width of the character's bounding box.
    A rotated character reports the height of `bbox` instead.
    """
    if not self.rotation:
        return self.bbox.width
    return self.bbox.height
The width of the character's bounding box.
@cached_property
def height(self) -> float:
    """
    The height of the character's bounding box.
    A rotated character reports the width of `bbox` instead.
    """
    if not self.rotation:
        return self.bbox.height
    return self.bbox.width
The height of the character's bounding box.
@cached_property
def tbbox(self) -> Rectangle:
    """The tight bounding box of the character."""
    tight = Rectangle(*self._text.get_charbox(self._index))
    if not self._page.rotation:
        return tight
    # Rotated page: swap the axes and mirror along the page height
    return Rectangle(
        tight.p0.y,
        self._page.height - tight.p1.x,
        tight.p1.y,
        self._page.height - tight.p0.x,
    )
The tight bounding box of the character.
@property
def bbox(self) -> Rectangle:
    """
    The loose bounding box of the character.
    .. note::
        If the loose bounding box is not available, the tight bounding box
        is used instead.
    """
    loose = self._bbox
    # A loose box is only usable if both its width and height are non-zero
    if loose.width and loose.height:
        return loose
    return self.tbbox
The loose bounding box of the character.
If the loose bounding box is not available, the tight bounding box is used instead.
@cached_property
def twidth(self) -> float:
    """The width of the character's tight bounding box (`tbbox`)."""
    tight = self.tbbox
    return tight.width
The width of the character's tight bounding box.
@cached_property
def theight(self) -> float:
    """The height of the character's tight bounding box (`tbbox`)."""
    tight = self.tbbox
    return tight.height
The height of the character's tight bounding box.
@cached_property
def render_mode(self) -> RenderMode:
    """The render mode of the character as a `Character.RenderMode` member."""
    raw_mode = pp.raw.FPDFText_GetTextRenderMode(self._text, self._index)
    return Character.RenderMode(raw_mode)
The render mode of the character.
@cached_property
def rotation(self) -> int:
    """The rotation of the character in degrees modulo 360."""
    page_rotation = self._page.rotation
    char_rotation = self._rotation
    # Special case for vertical text in rotated pages
    # (space, line feed and carriage return are excluded)
    if page_rotation == 90 and char_rotation == 0 and self.unicode not in {0x20, 0xa, 0xd}:
        return 90
    # Only combine both rotations if neither of them is zero
    if page_rotation and char_rotation:
        return (page_rotation + char_rotation) % 360
    return char_rotation
The rotation of the character in degrees modulo 360.
@cached_property
def size(self) -> float:
    """The font size of the character as reported by pdfium."""
    font_size = pp.raw.FPDFText_GetFontSize(self._text, self._index)
    return font_size
The font size of the character.
@cached_property
def weight(self) -> int:
    """The font weight of the character as reported by pdfium."""
    font_weight = pp.raw.FPDFText_GetFontWeight(self._text, self._index)
    return font_weight
The font weight of the character.
@cached_property
def fill(self) -> int:
    """The fill color of the character packed as `0xRRGGBBAA`."""
    red, green, blue, alpha = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
    pp.raw.FPDFText_GetFillColor(self._text, self._index, red, green, blue, alpha)
    # Pack the four channels into a single integer, red in the top byte
    packed = red.value << 24
    packed |= green.value << 16
    packed |= blue.value << 8
    packed |= alpha.value
    return packed
The fill color of the character.
@cached_property
def stroke(self) -> int:
    """The stroke color of the character packed as `0xRRGGBBAA`."""
    red, green, blue, alpha = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
    pp.raw.FPDFText_GetStrokeColor(self._text, self._index, red, green, blue, alpha)
    # Pack the four channels into a single integer, red in the top byte
    packed = red.value << 24
    packed |= green.value << 16
    packed |= blue.value << 8
    packed |= alpha.value
    return packed
The stroke color of the character.
@cached_property
def font(self) -> str:
    """The font name of the character."""
    name, _ = self._font_flags()
    return name
The font name of the character.
@cached_property
def flags(self) -> int:
    """The font flags of the character."""
    _, font_flags = self._font_flags()
    return font_flags
The font flags of the character.
def descr(self) -> str:
    """Human-readable description of the character for debugging."""
    symbol = chr(self.unicode)
    if not symbol.isprintable():
        # Show non-printable characters by their hexadecimal code point
        symbol = hex(self.unicode)
    fields = (
        symbol, self.size, self.weight, self.rotation,
        self.render_mode, self.font, hex(self.flags),
        self.fill, self.stroke, repr(self.bbox),
    )
    return "Chr(" + ", ".join(f"{field}" for field in fields) + ")"
Human-readable description of the character for debugging.
class RenderMode(Enum):
    """Tells the PDF viewer how to render this character glyph."""

    # Fallback member for a render mode that is not known
    UNKNOWN = -1

    # Modes that only paint the glyph
    FILL = 0
    STROKE = 1
    FILL_STROKE = 2

    # The glyph is not painted
    INVISIBLE = 3

    # Modes that involve clipping
    FILL_CLIP = 4
    STROKE_CLIP = 5
    FILL_STROKE_CLIP = 6
    CLIP = 7
Tells the PDF viewer how to render this character glyph.
Inherited Members
- enum.Enum
- name
- value