modm_data.pdf.link

Inter-PDF References and External Links

A PDF contains two types of links:

  1. Internal references to other objects by identifier: ObjLink.
  2. External links to URLs: WebLink.

Both types can be extracted via the modm_data.pdf.page.Page.objlinks and modm_data.pdf.page.Page.weblinks properties.

  1# Copyright 2022, Niklas Hauser
  2# SPDX-License-Identifier: MPL-2.0
  3
  4"""
  5# Inter-PDF References and External Links
  6
  7PDF contains two types of links:
  81. Internal references to other objects by identifier: `ObjLink`.
  92. External links to URLs: `WebLink`.
 10
 11Both types can be extracted by calling the `modm_data.pdf.page.Page.objlinks`
 12and `modm_data.pdf.page.Page.weblinks` properties.
 13"""
 14
 15import copy
 16import ctypes
 17from functools import cached_property
 18import pypdfium2 as pp
 19from ..utils import Rectangle
 20
 21
 22class ObjLink:
 23    """A link to a PDF object giving the bounding box and destination page."""
 24    def __init__(self, page: "modm_data.pdf.Page", link: pp.raw.FPDF_LINK):
 25        """
 26        :param page: Page containing the link, used to compute bounding box.
 27        :param link: Raw link object.
 28        """
 29        self._page = page
 30        self._dest = pp.raw.FPDFLink_GetDest(page.pdf, link)
 31
 32        bbox = pp.raw.FS_RECTF()
 33        assert pp.raw.FPDFLink_GetAnnotRect(link, bbox)
 34        bbox = Rectangle(bbox)
 35        if page.rotation:
 36            bbox = Rectangle(bbox.p0.y, page.height - bbox.p1.x,
 37                             bbox.p1.y, page.height - bbox.p0.x)
 38        self.bbox: Rectangle = bbox
 39        """Bounding box of the link source"""
 40
 41    @cached_property
 42    def page_index(self) -> int:
 43        """0-indexed page number of the link destination."""
 44        return pp.raw.FPDFDest_GetDestPageIndex(self._page.pdf, self._dest)
 45
 46    def __repr__(self) -> str:
 47        return f"Obj({self.page_index})"
 48
 49
 50class WebLink:
 51    """A weblink object giving the bounding box and destination URL."""
 52    def __init__(self, page: "modm_data.pdf.Page", index: int):
 53        """
 54        :param page: Page containing the link, used to compute bounding box.
 55        :param index: 0-index of the weblink object.
 56        """
 57        self._page = page
 58        self._link = page._linkpage
 59        self._index = index
 60
 61    @cached_property
 62    def bbox_count(self) -> int:
 63        """The number of bounding boxes associated with this weblink."""
 64        return pp.raw.FPDFLink_CountRects(self._link, self._index)
 65
 66    @cached_property
 67    def bboxes(self) -> list[Rectangle]:
 68        """The bounding boxes associated with this weblink."""
 69        bboxes = []
 70        for ii in range(self.bbox_count):
 71            x0, y0 = ctypes.c_double(), ctypes.c_double()
 72            x1, y1 = ctypes.c_double(), ctypes.c_double()
 73            assert pp.raw.FPDFLink_GetRect(self._link, self._index, ii, x0, y1, x1, y0)
 74            bboxes.append(Rectangle(x0.value, y0.value, x1.value, y1.value))
 75        if self._page.rotation:
 76            bboxes = [Rectangle(bbox.p0.y, self._page.height - bbox.p1.x,
 77                                bbox.p1.y, self._page.height - bbox.p0.x)
 78                      for bbox in bboxes]
 79        return bboxes
 80
 81    @cached_property
 82    def range(self) -> tuple[int, int]:
 83        """Start and end index of the characters associated with this link."""
 84        cstart = ctypes.c_int()
 85        ccount = ctypes.c_int()
 86        assert pp.raw.FPDFLink_GetTextRange(self._link, self._index, cstart, ccount)
 87        return (cstart.value, cstart.value + ccount.value)
 88
 89    @cached_property
 90    def url(self) -> str:
 91        """The URL string of this link."""
 92        length = 1000
 93        cbuffer = ctypes.c_ushort * length
 94        cbuffer = cbuffer()
 95        retlen = pp.raw.FPDFLink_GetURL(self._link, self._index, cbuffer, length)
 96        assert retlen < length
 97        return bytes(cbuffer).decode("utf-16-le").strip("\x00")
 98
 99    def __repr__(self) -> str:
100        return f"Url({self.url})"