modm_data.pdf

PDF Content Accessors

This module extends the pypdfium2 Python API with low-level accessors for characters and graphics. Note that these modules support read-only access to PDFs, since a lot of caching is used to speed up commonly accessed properties.

This module only contains formatting-independent PDF access, which is then specialized in the vendor-specific modm_data.pdf2html modules.

View Source

 1# Copyright 2022, Niklas Hauser
 2# SPDX-License-Identifier: MPL-2.0
 3
 4"""
 5# PDF Content Accessors
 6
 7This module extends the pypdfium2 Python API with low-level accessors for
 8characters and graphics. Note that these modules support read-only access to
 9PDFs, since a lot of caching is used to speed up commonly accessed properties.
10
11This module only contains formatting independent PDF access which is then
12specialized in the vendor-specific `modm_data.pdf2html` modules.
13"""
14
15from .document import Document
16from .page import Page
17from .character import Character
18from .link import ObjLink, WebLink
19from .path import Path
20from .image import Image
21from .render import annotate_debug_info
22from .structure import Structure
23
24__all__ = [
25    "annotate_debug_info",
26    "Document",
27    "Page",
28    "Character",
29    "Path",
30    "Image",
31    "ObjLink",
32    "WebLink",
33    "Structure",
34]

def annotate_debug_info( page: Page, new_doc: pypdfium2._helpers.document.PdfDocument = None, index: int = 0) -> pypdfium2._helpers.document.PdfDocument: View Source

 52def annotate_debug_info(page: Page, new_doc: pp.PdfDocument = None, index: int = 0) -> pp.PdfDocument:
 53    """
 54    Copies each page into a new or existing PDF document and overlays the internal information on top of the content.
 55    - Renders the bounding boxes in RED and origins in BLACK of all characters.
 56    - Renders the bounding boxes of web links in BLUE GREEN.
 57    - Renders the bounding boxes of object links in YELLOW GREEN.
 58    - Renders all graphics paths in BLUE.
 59    - Renders the bounding boxes of computed graphics clusters in CYAN.
 60
 61    :param page: The page to be annotated.
 62    :param new_doc: The PDF document to copy the page to. If not provided, a new document is created.
 63    :param index: The index of the page in the new document.
 64    :return: The new document with the annotated page added.
 65    """
 66    _, height = page.width, page.height
 67
 68    if new_doc is None:
 69        new_doc = pp.raw.FPDF_CreateNewDocument()
 70    # copy page over to new doc
 71    assert pp.raw.FPDF_ImportPages(new_doc, page.pdf, str(page.number).encode("ascii"), index)
 72    new_page = pp.raw.FPDF_LoadPage(new_doc, index)
 73    rotation = page.rotation
 74
 75    for path in page.paths:
 76        p0 = path.points[0]
 77        if rotation:
 78            obj = pp.raw.FPDFPageObj_CreateNewPath(height - p0.y, p0.x)
 79        else:
 80            obj = pp.raw.FPDFPageObj_CreateNewPath(p0.x, p0.y)
 81        assert pp.raw.FPDFPageObj_SetStrokeColor(obj, 0, 0, 0xFF, 0xC0)
 82        assert pp.raw.FPDFPageObj_SetStrokeWidth(obj, 0.25)
 83        assert pp.raw.FPDFPageObj_SetLineJoin(obj, pp.raw.FPDF_LINEJOIN_ROUND)
 84        assert pp.raw.FPDFPageObj_SetLineCap(obj, pp.raw.FPDF_LINECAP_ROUND)
 85        assert pp.raw.FPDFPath_SetDrawMode(obj, 0, True)
 86        for point in path.points[1:]:
 87            if point.type == path.Type.MOVE:
 88                if rotation:
 89                    assert pp.raw.FPDFPath_MoveTo(obj, height - point.y, point.x)
 90                else:
 91                    assert pp.raw.FPDFPath_MoveTo(obj, point.x, point.y)
 92            else:
 93                if rotation:
 94                    assert pp.raw.FPDFPath_LineTo(obj, height - point.y, point.x)
 95                else:
 96                    assert pp.raw.FPDFPath_LineTo(obj, point.x, point.y)
 97        pp.raw.FPDFPage_InsertObject(new_page, obj)
 98
 99    for bbox, _ in page.graphic_clusters():
100        _rect(new_page, rotation, bbox, width=2, stroke=0x00FFFF)
101
102    for link in page.objlinks:
103        _rect(new_page, rotation, link.bbox, width=0.75, stroke=0x9ACD32)
104
105    for link in page.weblinks:
106        for bbox in link.bboxes:
107            _rect(new_page, rotation, bbox, width=0.75, stroke=0x00FF00)
108
109    for char in page.chars:
110        color = 0x0000FF
111        if char.bbox.width:
112            _rect(new_page, rotation, char.bbox, width=0.5, stroke=0xFF0000)
113            _vline(
114                new_page,
115                rotation,
116                char.bbox.midpoint.x,
117                char.bbox.midpoint.y - 1,
118                char.bbox.midpoint.y + 1,
119                width=0.25,
120                stroke=0xFF0000,
121            )
122            _hline(
123                new_page,
124                rotation,
125                char.bbox.midpoint.y,
126                char.bbox.midpoint.x - 1,
127                char.bbox.midpoint.x + 1,
128                width=0.25,
129                stroke=0xFF0000,
130            )
131            color = 0x000000
132        _vline(new_page, rotation, char.origin.x, char.origin.y - 1, char.origin.y + 1, width=0.25, stroke=color)
133        _hline(new_page, rotation, char.origin.y, char.origin.x - 1, char.origin.x + 1, width=0.25, stroke=color)
134
135    assert pp.raw.FPDFPage_GenerateContent(new_page)
136    pp.raw.FPDF_ClosePage(new_page)
137    return new_doc

Copies the page into a new or existing PDF document and overlays the internal information on top of the content.

Renders the bounding boxes in RED and origins in BLACK of all characters (the origin of a character whose bounding box has no width is rendered in BLUE).
Renders the bounding boxes of web links in GREEN.
Renders the bounding boxes of object links in YELLOW GREEN.
Renders all graphics paths in BLUE.
Renders the bounding boxes of computed graphics clusters in CYAN.

Parameters

page: The page to be annotated.
new_doc: The PDF document to copy the page to. If not provided, a new document is created.
index: The index of the page in the new document.

Returns

The new document with the annotated page added.

class Character.RenderMode(enum.Enum): View Source

27    class RenderMode(Enum):
28        """Tells the PDF viewer how to render this character glyph."""
29
30        UNKNOWN = -1
31        FILL = 0
32        STROKE = 1
33        FILL_STROKE = 2
34        INVISIBLE = 3
35        FILL_CLIP = 4
36        STROKE_CLIP = 5
37        FILL_STROKE_CLIP = 6
38        CLIP = 7

Tells the PDF viewer how to render this character glyph.

UNKNOWN = <RenderMode.UNKNOWN: -1>

FILL = <RenderMode.FILL: 0>

STROKE = <RenderMode.STROKE: 1>

FILL_STROKE = <RenderMode.FILL_STROKE: 2>

INVISIBLE = <RenderMode.INVISIBLE: 3>

FILL_CLIP = <RenderMode.FILL_CLIP: 4>

STROKE_CLIP = <RenderMode.STROKE_CLIP: 5>

FILL_STROKE_CLIP = <RenderMode.FILL_STROKE_CLIP: 6>

CLIP = <RenderMode.CLIP: 7>

Inherited Members

enum.Enum: name; value

class Path.Type(enum.Enum): View Source

24    class Type(Enum):
25        """Path Type"""
26
27        LINE = 0
28        BEZIER = 1
29        MOVE = 2

Path Type

LINE = <Type.LINE: 0>

BEZIER = <Type.BEZIER: 1>

MOVE = <Type.MOVE: 2>

Inherited Members

enum.Enum: name; value

class Path.Cap(enum.Enum): View Source

31    class Cap(Enum):
32        """Path Cap Type"""
33
34        BUTT = 0
35        ROUND = 1
36        PROJECTING_SQUARE = 2

Path Cap Type

BUTT = <Cap.BUTT: 0>

ROUND = <Cap.ROUND: 1>

PROJECTING_SQUARE = <Cap.PROJECTING_SQUARE: 2>

Inherited Members

enum.Enum: name; value

class Path.Join(enum.Enum): View Source

38    class Join(Enum):
39        """Path Join Type"""
40
41        MITER = 0
42        ROUND = 1
43        BEVEL = 2

Path Join Type

MITER = <Join.MITER: 0>

ROUND = <Join.ROUND: 1>

BEVEL = <Join.BEVEL: 2>

Inherited Members

enum.Enum: name; value

class Image(pypdfium2._helpers.pageobjects.PdfImage): View Source

class Image(pp.PdfImage):
    """
    Adapter around `pypdfium2.PdfImage` that mirrors the interface of the
    `Path` class, so that images and paths can be passed to the same
    algorithms without filtering.

    You must construct the images by calling `modm_data.pdf.page.Page.images`.

    .. note:: Images are currently ignored.
    """

    def __new__(cls, *args, **kwargs):
        # Replace PdfPageObject.__new__: always allocate a plain instance of this class.
        return object.__new__(cls)

    def __init__(self, obj):
        """
        :param obj: Page object of the image.
        """
        image_type = pp.raw.FPDF_PAGEOBJ_IMAGE
        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
        # Only image page objects may be wrapped by this class.
        assert pp.raw.FPDFPageObj_GetType(obj.raw) == image_type
        self.type = image_type

        self.count: int = 4
        """Number of segments. Always 4 due to rectangular image form.
           (For compatibility with `Path.count`.)"""
        self.stroke: int = 0
        """The border stroke color. Always 0.
           (For compatibility with `Path.stroke`.)"""
        self.fill: int = 0
        """The image fill color. Always 0.
           (For compatibility with `Path.fill`.)"""
        self.width: float = 0
        """The border line width. Always 0.
           (For compatibility with `Path.width`.)"""

    @cached_property
    def matrix(self) -> pp.PdfMatrix:
        """The transformation matrix as returned by `get_matrix()`."""
        transform = self.get_matrix()
        return transform

    @cached_property
    def bbox(self) -> Rectangle:
        """The bounding box of the image, remapped if the page is rotated."""
        raw_box = Rectangle(*self.get_pos())
        if not self.page.rotation:
            return raw_box
        page_height = self.page.height
        return Rectangle(
            raw_box.p0.y,
            page_height - raw_box.p1.x,
            raw_box.p1.y,
            page_height - raw_box.p0.x,
        )

    @cached_property
    def points(self) -> list[Point]:
        """
        The 4 points of the bounding box.
        (For compatibility with `Path.points`.)
        """
        corners = self.bbox.points
        if not self.page.rotation:
            return corners
        # NOTE(review): `bbox` already remaps coordinates on rotated pages, so
        # the corners are remapped a second time here — confirm this is intended.
        page_height = self.page.height
        return [Point(corner.y, page_height - corner.x, corner.type) for corner in corners]

    @cached_property
    def lines(self) -> list[Line]:
        """
        The 4 lines of the bounding box.
        (For compatibility with `Path.lines`.)
        """
        corners = self.points
        # Connect consecutive corners and close the outline back to the first one.
        edges = ((0, 1), (1, 2), (2, 3), (3, 0))
        return [Line(corners[start], corners[end], corners[end].type, 0) for start, end in edges]

    def __repr__(self) -> str:
        return "I{}".format(self.bbox)

This class extends pypdfium2.PdfImage to align it with the interface of the Path class so that it can be used in the same algorithms without filtering.

You must construct the images by calling modm_data.pdf.page.Page.images.

Images are currently ignored.

Image(obj) View Source

25    def __init__(self, obj):
26        """
27        :param obj: Page object of the image.
28        """
29        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
30        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE
31        self.type = pp.raw.FPDF_PAGEOBJ_IMAGE
32
33        self.count: int = 4
34        """Number of segments. Always 4 due to rectangular image form.
35           (For compatibility with `Path.count`.)"""
36        self.stroke: int = 0
37        """The border stroke color. Always 0.
38           (For compatibility with `Path.stroke`.)"""
39        self.fill: int = 0
40        """The image fill color. Always 0.
41           (For compatibility with `Path.fill`.)"""
42        self.width: float = 0
43        """The border line width. Always 0.
44           (For compatibility with `Path.width`.)"""

Parameters

obj: Page object of the image.

type

count: int

Number of segments. Always 4 due to rectangular image form. (For compatibility with Path.count.)

stroke: int

The border stroke color. Always 0. (For compatibility with Path.stroke.)

fill: int

The image fill color. Always 0. (For compatibility with Path.fill.)

width: float

The border line width. Always 0. (For compatibility with Path.width.)

matrix: pypdfium2._helpers.matrix.PdfMatrix View Source

46    @cached_property
47    def matrix(self) -> pp.PdfMatrix:
48        """The transformation matrix."""
49        return self.get_matrix()

The transformation matrix.

bbox: modm_data.utils.Rectangle View Source

51    @cached_property
52    def bbox(self) -> Rectangle:
53        """The bounding box of the image."""
54        bbox = Rectangle(*self.get_pos())
55        if self.page.rotation:
56            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, bbox.p1.y, self.page.height - bbox.p0.x)
57        return bbox

The bounding box of the image.

points: list[modm_data.utils.Point] View Source

59    @cached_property
60    def points(self) -> list[Point]:
61        """
62        The 4 points of the bounding box.
63        (For compatibility with `Path.points`.)
64        """
65        points = self.bbox.points
66        if self.page.rotation:
67            points = [Point(p.y, self.page.height - p.x, p.type) for p in points]
68        return points

The 4 points of the bounding box. (For compatibility with Path.points.)

lines: list[modm_data.utils.Line] View Source

70    @cached_property
71    def lines(self) -> list[Line]:
72        """
73        The 4 lines of the bounding box.
74        (For compatibility with `Path.lines`.)
75        """
76        p = self.points
77        return [
78            Line(p[0], p[1], p[1].type, 0),
79            Line(p[1], p[2], p[2].type, 0),
80            Line(p[2], p[3], p[3].type, 0),
81            Line(p[3], p[0], p[0].type, 0),
82        ]

The 4 lines of the bounding box. (For compatibility with Path.lines.)

Inherited Members

pypdfium2._helpers.pageobjects.PdfImage: SIMPLE_FILTERS; new; get_metadata; get_size; load_jpeg; set_bitmap; get_bitmap; get_data; get_filters; extract
pypdfium2._helpers.pageobjects.PdfObject: parent; get_pos; get_matrix; set_matrix; transform
pypdfium2.internal.bases.AutoCloseable: close

class ObjLink: View Source

class ObjLink:
    """
    An internal reference to other objects by an identifier giving the bounding
    box and destination page. These links can be extracted by calling the
    `modm_data.pdf.page.Page.objlinks` property.
    """

    def __init__(self, page: "modm_data.pdf.Page", link: pp.raw.FPDF_LINK):  # noqa: F821
        """
        :param page: Page containing the link, used to compute bounding box.
        :param link: Raw link object.
        :raises AssertionError: If PDFium cannot provide the link rectangle.
        """
        self._page = page
        self._dest = pp.raw.FPDFLink_GetDest(page.pdf, link)

        rect = pp.raw.FS_RECTF()
        # The call fills `rect`, so it must not live inside an `assert`:
        # `python -O` would drop it and leave the rectangle zeroed.
        # AssertionError is kept so callers see the same exception type as before.
        if not pp.raw.FPDFLink_GetAnnotRect(link, rect):
            raise AssertionError("FPDFLink_GetAnnotRect failed")
        bbox = Rectangle(rect)
        if page.rotation:
            # Remap the rectangle for rotated pages.
            bbox = Rectangle(bbox.p0.y, page.height - bbox.p1.x, bbox.p1.y, page.height - bbox.p0.x)
        self.bbox: Rectangle = bbox
        """Bounding box of the link source"""

    @cached_property
    def page_index(self) -> int:
        """0-indexed page number of the link destination."""
        return pp.raw.FPDFDest_GetDestPageIndex(self._page.pdf, self._dest)

    def __repr__(self) -> str:
        return f"Obj({self.page_index})"

An internal reference to other objects by an identifier giving the bounding box and destination page. These links can be extracted by calling the modm_data.pdf.page.Page.objlinks property.

ObjLink( page: Page, link: pypdfium2_raw.bindings.LP_struct_fpdf_link_t__) View Source

18    def __init__(self, page: "modm_data.pdf.Page", link: pp.raw.FPDF_LINK):  # noqa: F821
19        """
20        :param page: Page containing the link, used to compute bounding box.
21        :param link: Raw link object.
22        """
23        self._page = page
24        self._dest = pp.raw.FPDFLink_GetDest(page.pdf, link)
25
26        bbox = pp.raw.FS_RECTF()
27        assert pp.raw.FPDFLink_GetAnnotRect(link, bbox)
28        bbox = Rectangle(bbox)
29        if page.rotation:
30            bbox = Rectangle(bbox.p0.y, page.height - bbox.p1.x, bbox.p1.y, page.height - bbox.p0.x)
31        self.bbox: Rectangle = bbox
32        """Bounding box of the link source"""

Parameters

page: Page containing the link, used to compute bounding box.
link: Raw link object.

bbox: modm_data.utils.Rectangle

Bounding box of the link source

page_index: int View Source

34    @cached_property
35    def page_index(self) -> int:
36        """0-indexed page number of the link destination."""
37        return pp.raw.FPDFDest_GetDestPageIndex(self._page.pdf, self._dest)

0-indexed page number of the link destination.

class WebLink: View Source

class WebLink:
    """
    An external reference to URLs giving the bounding box and destination URL.
    These links can be extracted by calling the
    `modm_data.pdf.page.Page.weblinks` property.
    """

    def __init__(self, page: "modm_data.pdf.Page", index: int):  # noqa: F821
        """
        :param page: Page containing the link, used to compute bounding box.
        :param index: 0-index of the weblink object.
        """
        self._page = page
        self._link = page._linkpage
        self._index = index

    @cached_property
    def bbox_count(self) -> int:
        """The number of bounding boxes associated with this weblink."""
        return pp.raw.FPDFLink_CountRects(self._link, self._index)

    @cached_property
    def bboxes(self) -> list[Rectangle]:
        """
        The bounding boxes associated with this weblink.

        :raises AssertionError: If PDFium cannot provide one of the rectangles.
        """
        bboxes = []
        for ii in range(self.bbox_count):
            x0, y0 = ctypes.c_double(), ctypes.c_double()
            x1, y1 = ctypes.c_double(), ctypes.c_double()
            # The call fills the coordinates, so it must not live inside an
            # `assert`: `python -O` would drop it and leave all values at zero.
            if not pp.raw.FPDFLink_GetRect(self._link, self._index, ii, x0, y1, x1, y0):
                raise AssertionError("FPDFLink_GetRect failed")
            bboxes.append(Rectangle(x0.value, y0.value, x1.value, y1.value))
        if self._page.rotation:
            # Remap the rectangles for rotated pages.
            bboxes = [
                Rectangle(bbox.p0.y, self._page.height - bbox.p1.x, bbox.p1.y, self._page.height - bbox.p0.x)
                for bbox in bboxes
            ]
        return bboxes

    @cached_property
    def range(self) -> tuple[int, int]:
        """
        Start and end index of the characters associated with this link.
        The end index is computed as start index plus character count.

        :raises AssertionError: If PDFium cannot provide the text range.
        """
        cstart = ctypes.c_int()
        ccount = ctypes.c_int()
        # Call outside of `assert` so that it still happens under `python -O`.
        if not pp.raw.FPDFLink_GetTextRange(self._link, self._index, cstart, ccount):
            raise AssertionError("FPDFLink_GetTextRange failed")
        return (cstart.value, cstart.value + ccount.value)

    @cached_property
    def url(self) -> str:
        """The URL string of this link."""
        # Ask PDFium for the required number of 16-bit code units first, so
        # that URLs of any length fit (previously limited to a 1000-unit buffer).
        needed = pp.raw.FPDFLink_GetURL(self._link, self._index, None, 0)
        if needed <= 0:
            return ""
        cbuffer = (ctypes.c_ushort * needed)()
        copied = pp.raw.FPDFLink_GetURL(self._link, self._index, cbuffer, needed)
        # Only decode the code units that were actually written (2 bytes each).
        data = bytes(cbuffer)[: copied * ctypes.sizeof(ctypes.c_ushort)]
        return data.decode("utf-16-le").strip("\x00")

    def __repr__(self) -> str:
        return f"Url({self.url})"

An external reference to URLs giving the bounding box and destination URL. These links can be extracted by calling the modm_data.pdf.page.Page.weblinks property.

WebLink(page: Page, index: int) View Source

50    def __init__(self, page: "modm_data.pdf.Page", index: int):  # noqa: F821
51        """
52        :param page: Page containing the link, used to compute bounding box.
53        :param index: 0-index of the weblink object.
54        """
55        self._page = page
56        self._link = page._linkpage
57        self._index = index

Parameters

page: Page containing the link, used to compute bounding box.
index: 0-index of the weblink object.

bbox_count: int View Source

59    @cached_property
60    def bbox_count(self) -> int:
61        """The number of bounding boxes associated with this weblink."""
62        return pp.raw.FPDFLink_CountRects(self._link, self._index)

The number of bounding boxes associated with this weblink.

bboxes: list[modm_data.utils.Rectangle] View Source

64    @cached_property
65    def bboxes(self) -> list[Rectangle]:
66        """The bounding boxes associated with this weblink."""
67        bboxes = []
68        for ii in range(self.bbox_count):
69            x0, y0 = ctypes.c_double(), ctypes.c_double()
70            x1, y1 = ctypes.c_double(), ctypes.c_double()
71            assert pp.raw.FPDFLink_GetRect(self._link, self._index, ii, x0, y1, x1, y0)
72            bboxes.append(Rectangle(x0.value, y0.value, x1.value, y1.value))
73        if self._page.rotation:
74            bboxes = [
75                Rectangle(bbox.p0.y, self._page.height - bbox.p1.x, bbox.p1.y, self._page.height - bbox.p0.x)
76                for bbox in bboxes
77            ]
78        return bboxes

The bounding boxes associated with this weblink.

range: tuple[int, int] View Source

80    @cached_property
81    def range(self) -> tuple[int, int]:
82        """Start and end index of the characters associated with this link."""
83        cstart = ctypes.c_int()
84        ccount = ctypes.c_int()
85        assert pp.raw.FPDFLink_GetTextRange(self._link, self._index, cstart, ccount)
86        return (cstart.value, cstart.value + ccount.value)

Start and end index of the characters associated with this link.

url: str View Source

88    @cached_property
89    def url(self) -> str:
90        """The URL string of this link."""
91        length = 1000
92        cbuffer = ctypes.c_ushort * length
93        cbuffer = cbuffer()
94        retlen = pp.raw.FPDFLink_GetURL(self._link, self._index, cbuffer, length)
95        assert retlen < length
96        return bytes(cbuffer).decode("utf-16-le").strip("\x00")

The URL string of this link.