modm_data.pdf.graphics
PDF Graphics
PDF uses a subset of the PostScript graphics language, which draws vector paths with various rendering options. We are only interested in the basic properties, in particular, for recognizing table cell borders.
In addition, images support bitmap data.
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# PDF Graphics

PDF uses a subset of the PostScript graphics language, which draws vector paths
with various rendering options. We are only interested in the basic properties,
in particular, for recognizing table cell borders.

In addition, images support bitmap data.
"""

import ctypes
from functools import cached_property
from enum import Enum
import pypdfium2 as pp
from ..utils import Point, Rectangle, Line


class Path(pp.PdfObject):
    """
    This class specializes `pypdfium2.PdfObject` to add accessors for graphics
    containing vector paths of various configurations.

    You must construct the paths by calling `modm_data.pdf.page.Page.paths`.
    """
    class Type(Enum):
        """Path Type"""
        LINE = 0
        BEZIER = 1
        MOVE = 2

    class Cap(Enum):
        """Path Cap Type"""
        BUTT = 0
        ROUND = 1
        PROJECTING_SQUARE = 2

    class Join(Enum):
        """Path Join Type"""
        MITER = 0
        ROUND = 1
        BEVEL = 2

    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: PDF object of the path.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_PATH
        self.type = pp.raw.FPDF_PAGEOBJ_PATH

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        return self.get_matrix()

    @cached_property
    def count(self) -> int:
        """Number of segments in this path."""
        return pp.raw.FPDFPath_CountSegments(self)

    @cached_property
    def fill(self) -> int:
        """
        The fill color encoded as 32-bit RGBA.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide the color.
        """
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        # The call fills the out-parameters, so it must not live inside an
        # `assert`, which is removed entirely when running with `python -O`.
        if not pp.raw.FPDFPageObj_GetFillColor(self, r, g, b, a):
            raise pp.PdfiumError("Failed to get the fill color of the path.")
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def stroke(self) -> int:
        """
        The stroke color encoded as 32-bit RGBA.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide the color.
        """
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        # See `fill`: the call must also run when assertions are disabled.
        if not pp.raw.FPDFPageObj_GetStrokeColor(self, r, g, b, a):
            raise pp.PdfiumError("Failed to get the stroke color of the path.")
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def width(self) -> float:
        """
        The stroke width.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide the width.
        """
        width = ctypes.c_float()
        # See `fill`: the call must also run when assertions are disabled.
        if not pp.raw.FPDFPageObj_GetStrokeWidth(self, width):
            raise pp.PdfiumError("Failed to get the stroke width of the path.")
        return width.value

    @cached_property
    def cap(self) -> Cap:
        """Line cap type."""
        return Path.Cap(pp.raw.FPDFPageObj_GetLineCap(self))

    @cached_property
    def join(self) -> Join:
        """Line join type."""
        return Path.Join(pp.raw.FPDFPageObj_GetLineJoin(self))

    @cached_property
    def bbox(self) -> Rectangle:
        """
        Bounding box of the path.
        .. warning::
            The bounding box is only approximated using the control points!
            Therefore bezier curves will likely have a larger bounding box.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide the bounds.
        """
        left, bottom = ctypes.c_float(), ctypes.c_float()
        right, top = ctypes.c_float(), ctypes.c_float()
        # See `fill`: the call must also run when assertions are disabled.
        if not pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top):
            raise pp.PdfiumError("Failed to get the bounds of the path.")
        bbox = Rectangle(left.value, bottom.value, right.value, top.value)
        if self.page.rotation:
            # NOTE(review): the same transform is applied for every non-zero
            # rotation value -- confirm which rotations can occur here.
            height = self.page.height
            bbox = Rectangle(bbox.p0.y, height - bbox.p1.x,
                             bbox.p1.y, height - bbox.p0.x)
        return bbox

    @cached_property
    def points(self) -> list[Point]:
        """
        List of points of the path. If the path is closed, the first point is
        added to the end of the list.

        Every point is mapped through `matrix`; when `self.page.rotation` is
        set, each point is additionally transformed like `bbox`.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide a point.
        """
        points = []
        for ii in range(self.count):
            seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
            ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
            # The first point should always be MOVETO
            assert ii or ptype == Path.Type.MOVE

            x, y = ctypes.c_float(), ctypes.c_float()
            # See `fill`: the call must also run when assertions are disabled.
            if not pp.raw.FPDFPathSegment_GetPoint(seg, x, y):
                raise pp.PdfiumError(f"Failed to get point of path segment {ii}.")
            px, py = self.matrix.on_point(x.value, y.value)
            points.append(Point(px, py, type=ptype))

            # NOTE(review): a closing segment always returns to the very first
            # point; for paths with several MOVE segments this presumably
            # should be the start of the current subpath -- confirm.
            if pp.raw.FPDFPathSegment_GetClose(seg):
                points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))

        if self.page.rotation:
            # Transform each point individually. Using the loop locals here
            # would map every point onto the last segment's coordinates.
            height = self.page.height
            points = [Point(p.y, height - p.x, type=p.type) for p in points]
        return points

    @cached_property
    def lines(self) -> list[Line]:
        """List of lines between the path points."""
        points = self.points
        return [Line(points[ii], points[ii + 1], width=self.width,
                     type=points[ii + 1].type) for ii in range(len(points) - 1)]

    def __repr__(self) -> str:
        points = ",".join(repr(p) for p in self.points)
        return f"P{self.count}={points}"


class Image(pp.PdfImage):
    """
    This class extends `pypdfium2.PdfImage` to align it with the interface of
    the `Path` class so that it can be used in the same
    algorithms without filtering.

    You must construct the images by calling `modm_data.pdf.page.Page.images`.

    .. note:: Images are currently ignored.
    """
    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: Page object of the image.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE
        self.type = pp.raw.FPDF_PAGEOBJ_IMAGE

        self.count: int = 4
        """Number of segments. Always 4 due to rectangular image form.
        (For compatibility with `Path.count`.)"""
        self.stroke: int = 0
        """The border stroke color. Always 0.
        (For compatibility with `Path.stroke`.)"""
        self.fill: int = 0
        """The image fill color. Always 0.
        (For compatibility with `Path.fill`.)"""
        self.width: float = 0
        """The border line width. Always 0.
        (For compatibility with `Path.width`.)"""

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        return self.get_matrix()

    @cached_property
    def bbox(self) -> Rectangle:
        """The bounding box of the image."""
        bbox = Rectangle(*self.get_pos())
        if self.page.rotation:
            height = self.page.height
            bbox = Rectangle(bbox.p0.y, height - bbox.p1.x,
                             bbox.p1.y, height - bbox.p0.x)
        return bbox

    @cached_property
    def points(self) -> list[Point]:
        """
        The 4 points of the bounding box.
        (For compatibility with `Path.points`.)
        """
        # `bbox` is already transformed for rotated pages. Transforming its
        # corners a second time would move them off the bounding box.
        return self.bbox.points

    @cached_property
    def lines(self) -> list[Line]:
        """
        The 4 lines of the bounding box.
        (For compatibility with `Path.lines`.)
        """
        p = self.points
        # Each line takes the type of its end point; the last line closes the
        # rectangle. Keyword arguments mirror the `Line` usage in `Path.lines`.
        return [Line(p[ii], p[(ii + 1) % 4], width=0, type=p[(ii + 1) % 4].type)
                for ii in range(4)]

    def __repr__(self) -> str:
        return f"I{self.bbox}"
class Path(pp.PdfObject):
    """
    This class specializes `pypdfium2.PdfObject` to add accessors for graphics
    containing vector paths of various configurations.

    You must construct the paths by calling `modm_data.pdf.page.Page.paths`.
    """
    class Type(Enum):
        """Path Type"""
        LINE = 0
        BEZIER = 1
        MOVE = 2

    class Cap(Enum):
        """Path Cap Type"""
        BUTT = 0
        ROUND = 1
        PROJECTING_SQUARE = 2

    class Join(Enum):
        """Path Join Type"""
        MITER = 0
        ROUND = 1
        BEVEL = 2

    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: PDF object of the path.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_PATH
        self.type = pp.raw.FPDF_PAGEOBJ_PATH

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        return self.get_matrix()

    @cached_property
    def count(self) -> int:
        """Number of segments in this path."""
        return pp.raw.FPDFPath_CountSegments(self)

    @cached_property
    def fill(self) -> int:
        """
        The fill color encoded as 32-bit RGBA.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide the color.
        """
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        # The call fills the out-parameters, so it must not live inside an
        # `assert`, which is removed entirely when running with `python -O`.
        if not pp.raw.FPDFPageObj_GetFillColor(self, r, g, b, a):
            raise pp.PdfiumError("Failed to get the fill color of the path.")
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def stroke(self) -> int:
        """
        The stroke color encoded as 32-bit RGBA.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide the color.
        """
        r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
        # See `fill`: the call must also run when assertions are disabled.
        if not pp.raw.FPDFPageObj_GetStrokeColor(self, r, g, b, a):
            raise pp.PdfiumError("Failed to get the stroke color of the path.")
        return r.value << 24 | g.value << 16 | b.value << 8 | a.value

    @cached_property
    def width(self) -> float:
        """
        The stroke width.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide the width.
        """
        width = ctypes.c_float()
        # See `fill`: the call must also run when assertions are disabled.
        if not pp.raw.FPDFPageObj_GetStrokeWidth(self, width):
            raise pp.PdfiumError("Failed to get the stroke width of the path.")
        return width.value

    @cached_property
    def cap(self) -> Cap:
        """Line cap type."""
        return Path.Cap(pp.raw.FPDFPageObj_GetLineCap(self))

    @cached_property
    def join(self) -> Join:
        """Line join type."""
        return Path.Join(pp.raw.FPDFPageObj_GetLineJoin(self))

    @cached_property
    def bbox(self) -> Rectangle:
        """
        Bounding box of the path.
        .. warning::
            The bounding box is only approximated using the control points!
            Therefore bezier curves will likely have a larger bounding box.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide the bounds.
        """
        left, bottom = ctypes.c_float(), ctypes.c_float()
        right, top = ctypes.c_float(), ctypes.c_float()
        # See `fill`: the call must also run when assertions are disabled.
        if not pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top):
            raise pp.PdfiumError("Failed to get the bounds of the path.")
        bbox = Rectangle(left.value, bottom.value, right.value, top.value)
        if self.page.rotation:
            # NOTE(review): the same transform is applied for every non-zero
            # rotation value -- confirm which rotations can occur here.
            height = self.page.height
            bbox = Rectangle(bbox.p0.y, height - bbox.p1.x,
                             bbox.p1.y, height - bbox.p0.x)
        return bbox

    @cached_property
    def points(self) -> list[Point]:
        """
        List of points of the path. If the path is closed, the first point is
        added to the end of the list.

        Every point is mapped through `matrix`; when `self.page.rotation` is
        set, each point is additionally transformed like `bbox`.

        :raises pypdfium2.PdfiumError: if PDFium cannot provide a point.
        """
        points = []
        for ii in range(self.count):
            seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
            ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
            # The first point should always be MOVETO
            assert ii or ptype == Path.Type.MOVE

            x, y = ctypes.c_float(), ctypes.c_float()
            # See `fill`: the call must also run when assertions are disabled.
            if not pp.raw.FPDFPathSegment_GetPoint(seg, x, y):
                raise pp.PdfiumError(f"Failed to get point of path segment {ii}.")
            px, py = self.matrix.on_point(x.value, y.value)
            points.append(Point(px, py, type=ptype))

            # NOTE(review): a closing segment always returns to the very first
            # point; for paths with several MOVE segments this presumably
            # should be the start of the current subpath -- confirm.
            if pp.raw.FPDFPathSegment_GetClose(seg):
                points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))

        if self.page.rotation:
            # Transform each point individually. Using the loop locals here
            # would map every point onto the last segment's coordinates.
            height = self.page.height
            points = [Point(p.y, height - p.x, type=p.type) for p in points]
        return points

    @cached_property
    def lines(self) -> list[Line]:
        """List of lines between the path points."""
        points = self.points
        return [Line(points[ii], points[ii + 1], width=self.width,
                     type=points[ii + 1].type) for ii in range(len(points) - 1)]

    def __repr__(self) -> str:
        points = ",".join(repr(p) for p in self.points)
        return f"P{self.count}={points}"
This class specializes pypdfium2.PdfObject
to add accessors for graphics
containing vector paths of various configurations.
You must construct the paths by calling modm_data.pdf.page.Page.paths
.
def __init__(self, obj):
    """
    Wrap an existing page object as a `Path`.

    :param obj: PDF object of the path.
    """
    raw = obj.raw
    super().__init__(raw, obj.page, obj.pdf, obj.level)
    # Only genuine path objects may be wrapped by this class.
    path_type = pp.raw.FPDF_PAGEOBJ_PATH
    assert pp.raw.FPDFPageObj_GetType(raw) == path_type
    self.type = path_type
Parameters
- obj: PDF object of the path.
@cached_property
def matrix(self) -> pp.PdfMatrix:
    """Transformation matrix of the path, computed once and then cached."""
    transform = self.get_matrix()
    return transform
The transformation matrix.
@cached_property
def count(self) -> int:
    """Segment count of this path as reported by `FPDFPath_CountSegments`."""
    segment_count = pp.raw.FPDFPath_CountSegments(self)
    return segment_count
Number of segments in this path.
@cached_property
def fill(self) -> int:
    """
    The fill color encoded as 32-bit RGBA.

    :raises pypdfium2.PdfiumError: if PDFium cannot provide the color.
    """
    r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
    # The call fills the out-parameters, so it must not live inside an
    # `assert`, which is removed entirely when running with `python -O`.
    if not pp.raw.FPDFPageObj_GetFillColor(self, r, g, b, a):
        raise pp.PdfiumError("Failed to get the fill color of the path.")
    # Red ends up in the most significant byte, alpha in the least.
    return r.value << 24 | g.value << 16 | b.value << 8 | a.value
The fill color encoded as 32-bit RGBA.
@cached_property
def stroke(self) -> int:
    """
    The stroke color encoded as 32-bit RGBA.

    :raises pypdfium2.PdfiumError: if PDFium cannot provide the color.
    """
    r, g, b, a = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
    # The call fills the out-parameters, so it must not live inside an
    # `assert`, which is removed entirely when running with `python -O`.
    if not pp.raw.FPDFPageObj_GetStrokeColor(self, r, g, b, a):
        raise pp.PdfiumError("Failed to get the stroke color of the path.")
    # Red ends up in the most significant byte, alpha in the least.
    return r.value << 24 | g.value << 16 | b.value << 8 | a.value
The stroke color encoded as 32-bit RGBA.
@cached_property
def width(self) -> float:
    """
    The stroke width.

    :raises pypdfium2.PdfiumError: if PDFium cannot provide the width.
    """
    width = ctypes.c_float()
    # The call fills the out-parameter, so it must not live inside an
    # `assert`, which is removed entirely when running with `python -O`.
    if not pp.raw.FPDFPageObj_GetStrokeWidth(self, width):
        raise pp.PdfiumError("Failed to get the stroke width of the path.")
    return width.value
The stroke width.
@cached_property
def cap(self) -> Cap:
    """Line cap of the path, converted into a `Path.Cap` member."""
    raw_cap = pp.raw.FPDFPageObj_GetLineCap(self)
    return Path.Cap(raw_cap)
Line cap type.
@cached_property
def join(self) -> Join:
    """Line join of the path, converted into a `Path.Join` member."""
    raw_join = pp.raw.FPDFPageObj_GetLineJoin(self)
    return Path.Join(raw_join)
Line join type.
@cached_property
def bbox(self) -> Rectangle:
    """
    Bounding box of the path.
    .. warning::
        The bounding box is only approximated using the control points!
        Therefore bezier curves will likely have a larger bounding box.

    :raises pypdfium2.PdfiumError: if PDFium cannot provide the bounds.
    """
    left, bottom = ctypes.c_float(), ctypes.c_float()
    right, top = ctypes.c_float(), ctypes.c_float()
    # The call fills the out-parameters, so it must not live inside an
    # `assert`, which is removed entirely when running with `python -O`.
    if not pp.raw.FPDFPageObj_GetBounds(self, left, bottom, right, top):
        raise pp.PdfiumError("Failed to get the bounds of the path.")
    bbox = Rectangle(left.value, bottom.value, right.value, top.value)
    if self.page.rotation:
        # NOTE(review): the same transform is applied for every non-zero
        # rotation value -- confirm which rotations can occur here.
        height = self.page.height
        bbox = Rectangle(bbox.p0.y, height - bbox.p1.x,
                         bbox.p1.y, height - bbox.p0.x)
    return bbox
Bounding box of the path.
The bounding box is only approximated using the control points! Therefore Bézier curves will likely have a larger bounding box.
@cached_property
def points(self) -> list[Point]:
    """
    List of points of the path. If the path is closed, the first point is
    added to the end of the list.

    Every point is mapped through `matrix`; when `self.page.rotation` is
    set, each point is additionally transformed like `bbox`.

    :raises pypdfium2.PdfiumError: if PDFium cannot provide a point.
    """
    points = []
    for ii in range(self.count):
        seg = pp.raw.FPDFPath_GetPathSegment(self, ii)
        ptype = Path.Type(pp.raw.FPDFPathSegment_GetType(seg))
        # The first point should always be MOVETO
        assert ii or ptype == Path.Type.MOVE

        x, y = ctypes.c_float(), ctypes.c_float()
        # The call fills the out-parameters, so it must not live inside an
        # `assert`, which is removed entirely when running with `python -O`.
        if not pp.raw.FPDFPathSegment_GetPoint(seg, x, y):
            raise pp.PdfiumError(f"Failed to get point of path segment {ii}.")
        px, py = self.matrix.on_point(x.value, y.value)
        points.append(Point(px, py, type=ptype))

        # NOTE(review): a closing segment always returns to the very first
        # point; for paths with several MOVE segments this presumably
        # should be the start of the current subpath -- confirm.
        if pp.raw.FPDFPathSegment_GetClose(seg):
            points.append(Point(points[0].x, points[0].y, type=Path.Type.LINE))

    if self.page.rotation:
        # Transform each point individually. Using the loop locals here
        # would map every point onto the last segment's coordinates.
        height = self.page.height
        points = [Point(p.y, height - p.x, type=p.type) for p in points]
    return points
List of points of the path. If the path is closed, the first point is added to the end of the list.
@cached_property
def lines(self) -> list[Line]:
    """Lines connecting each pair of consecutive path points."""
    pts = self.points
    segments = []
    # Pair every point with its successor; each line takes the type of its
    # end point and the stroke width of the path.
    for start, end in zip(pts, pts[1:]):
        segments.append(Line(start, end, width=self.width, type=end.type))
    return segments
List of lines between the path points.
Inherited Members
- pypdfium2._helpers.pageobjects.PdfObject
- parent
- get_pos
- get_matrix
- set_matrix
- transform
- pypdfium2.internal.bases.AutoCloseable
- close
Path Type
Inherited Members
- enum.Enum
- name
- value
Path Cap Type
Inherited Members
- enum.Enum
- name
- value
Path Join Type
Inherited Members
- enum.Enum
- name
- value
class Image(pp.PdfImage):
    """
    This class extends `pypdfium2.PdfImage` to align it with the interface of
    the `Path` class so that it can be used in the same
    algorithms without filtering.

    You must construct the images by calling `modm_data.pdf.page.Page.images`.

    .. note:: Images are currently ignored.
    """
    # Overwrite the PdfPageObject.__new__ function
    def __new__(cls, *args, **kwargs):
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: Page object of the image.
        """
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE
        self.type = pp.raw.FPDF_PAGEOBJ_IMAGE

        self.count: int = 4
        """Number of segments. Always 4 due to rectangular image form.
        (For compatibility with `Path.count`.)"""
        self.stroke: int = 0
        """The border stroke color. Always 0.
        (For compatibility with `Path.stroke`.)"""
        self.fill: int = 0
        """The image fill color. Always 0.
        (For compatibility with `Path.fill`.)"""
        self.width: float = 0
        """The border line width. Always 0.
        (For compatibility with `Path.width`.)"""

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix."""
        return self.get_matrix()

    @cached_property
    def bbox(self) -> Rectangle:
        """The bounding box of the image."""
        bbox = Rectangle(*self.get_pos())
        if self.page.rotation:
            height = self.page.height
            bbox = Rectangle(bbox.p0.y, height - bbox.p1.x,
                             bbox.p1.y, height - bbox.p0.x)
        return bbox

    @cached_property
    def points(self) -> list[Point]:
        """
        The 4 points of the bounding box.
        (For compatibility with `Path.points`.)
        """
        # `bbox` is already transformed for rotated pages. Transforming its
        # corners a second time would move them off the bounding box.
        return self.bbox.points

    @cached_property
    def lines(self) -> list[Line]:
        """
        The 4 lines of the bounding box.
        (For compatibility with `Path.lines`.)
        """
        p = self.points
        # Each line takes the type of its end point; the last line closes the
        # rectangle. Keyword arguments mirror the `Line` usage in `Path.lines`.
        return [Line(p[ii], p[(ii + 1) % 4], width=0, type=p[(ii + 1) % 4].type)
                for ii in range(4)]

    def __repr__(self) -> str:
        return f"I{self.bbox}"
This class extends pypdfium2.PdfImage
to align it with the interface of
the Path
class so that it can be used in the same
algorithms without filtering.
You must construct the images by calling modm_data.pdf.page.Page.images
.
Images are currently ignored.
def __init__(self, obj):
    """
    Wrap an existing page object as an `Image`.

    :param obj: Page object of the image.
    """
    raw = obj.raw
    super().__init__(raw, obj.page, obj.pdf, obj.level)
    # Only genuine image objects may be wrapped by this class.
    image_type = pp.raw.FPDF_PAGEOBJ_IMAGE
    assert pp.raw.FPDFPageObj_GetType(raw) == image_type
    self.type = image_type

    self.count: int = 4
    """Number of segments. Always 4 due to rectangular image form.
    (For compatibility with `Path.count`.)"""
    self.stroke: int = 0
    """The border stroke color. Always 0.
    (For compatibility with `Path.stroke`.)"""
    self.fill: int = 0
    """The image fill color. Always 0.
    (For compatibility with `Path.fill`.)"""
    self.width: float = 0
    """The border line width. Always 0.
    (For compatibility with `Path.width`.)"""
Parameters
- obj: Page object of the image.
Number of segments. Always 4 due to rectangular image form.
(For compatibility with Path.count
.)
@cached_property
def matrix(self) -> pp.PdfMatrix:
    """Transformation matrix of the image, computed once and then cached."""
    transform = self.get_matrix()
    return transform
The transformation matrix.
@cached_property
def bbox(self) -> Rectangle:
    """Bounding box of the image, built from `get_pos()`."""
    box = Rectangle(*self.get_pos())
    if not self.page.rotation:
        return box
    # Pages with a rotation get their box coordinates swapped and flipped
    # against the page height.
    return Rectangle(box.p0.y, self.page.height - box.p1.x,
                     box.p1.y, self.page.height - box.p0.x)
The bounding box of the image.
@cached_property
def points(self) -> list[Point]:
    """
    The 4 points of the bounding box.
    (For compatibility with `Path.points`.)
    """
    # `bbox` is already transformed for rotated pages. Transforming its
    # corners a second time would move them off the bounding box.
    return self.bbox.points
The 4 points of the bounding box.
(For compatibility with Path.points
.)
@cached_property
def lines(self) -> list[Line]:
    """
    The 4 lines of the bounding box.
    (For compatibility with `Path.lines`.)
    """
    p = self.points
    # Each line takes the type of its end point; the last line closes the
    # rectangle. Keyword arguments mirror the `Line` usage in `Path.lines`,
    # so `width` and `type` cannot be mixed up or mistaken for coordinates.
    return [Line(p[ii], p[(ii + 1) % 4], width=0, type=p[(ii + 1) % 4].type)
            for ii in range(4)]
The 4 lines of the bounding box.
(For compatibility with Path.lines
.)
Inherited Members
- pypdfium2._helpers.pageobjects.PdfImage
- SIMPLE_FILTERS
- new
- get_metadata
- get_size
- load_jpeg
- set_bitmap
- get_bitmap
- get_data
- get_filters
- extract
- pypdfium2._helpers.pageobjects.PdfObject
- parent
- get_pos
- get_matrix
- set_matrix
- transform
- pypdfium2.internal.bases.AutoCloseable
- close