modm_data.pdf2html.page API documentation

Page(document, index: int) View Source

    def __init__(self, document, index: int):
        """
        Create a page of a PDF document.

        :param document: a PDF document.
        :param index: 0-index page number.
        """
        super().__init__(document, index)
        # Name of the template for this page, initialized to "default".
        # NOTE(review): how the template name is consumed is not visible here — confirm.
        self._template = "default"
        self.is_relevant: bool = True
        """Is this page relevant for the conversion?"""

Parameters

document: a PDF document.
index: zero-based page number.

is_relevant: bool

Is this page relevant for the conversion?

def text_in_named_area(self, name: str, check_length: bool = True) -> str | None: View Source

109    def text_in_named_area(self, name: str, check_length: bool = True) -> str | None:
110        """
111        Find all text in the named area.
112
113        :param name: the name of the area(s) to query.
114        :param check_length: assert that the text has a length.
115        :return: the concatenated text of the named area(s) or `None` if area not found.
116        """
117        if name not in self._areas:
118            return None
119        text = ""
120        areas = self._areas[name]
121        if not isinstance(areas, list):
122            areas = [areas]
123        for area in areas:
124            text += self.text_in_area(area)
125        if check_length:
126            assert text
127        return text

Find all text in the named area.

Parameters

name: the name of the area(s) to query.
check_length: assert that the text is not empty.

Returns

the concatenated text of the named area(s) or None if area not found.

def charlines_in_area( self, area: modm_data.utils.Rectangle, predicate: Callable[[modm_data.pdf.Character], bool] = None, rtol: float = None) -> list[modm_data.pdf2html.line.CharLine]: View Source

129    def charlines_in_area(
130        self, area: Rectangle, predicate: Callable[[Character], bool] = None, rtol: float = None
131    ) -> list[CharLine]:
132        """
133        Coalesce the characters in the area and predicate into lines.
134
135        1. Every character in the area is filtered by the `predicate`.
136        2. Character orientation is split into horizontal (left->right) and
137           vertical (bottom->top) character lines sorted by x or y position.
138           Lines containing only whitespace are discarded.
139        3. Overlapping character lines are merged into sub- and superscript
140           using `rtol * max(current_line.height, next_line.height)` as the
141           tolerance for checking if the lines overlap.
142        4. The characters in the merged lines are re-sorted by origin.
143
144        :param area: Area to search for characters.
145        :param predicate: Function to discard characters in the area or include all by default.
146        :param rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default.
147        :return: A list of character lines sorted by x or y position.
148        """
149        if rtol is None:
150            rtol = self._spacing["sc"]
151        # Split all chars into lines based on rounded origin
152        origin_lines_y = defaultdict(list)
153        origin_lines_x = defaultdict(list)
154        for char in self.chars_in_area(area):
155            # Ignore all characters we don't want
156            if predicate is not None and not predicate(char):
157                continue
158            cunicode = self._unicode_filter(char.unicode)
159            if cunicode is None:
160                continue
161            char.unicode = cunicode
162            if char.unicode < 32 and char.unicode not in {0xA}:
163                continue
164            # Ignore characters without width that are not spaces
165            if not char.width and char.unicode not in {0xA, 0xD, 0x20}:
166                _LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
167            # Split up the chars depending on the orientation
168            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
169                origin_lines_x[round(char.origin.x, 1)].append(char)
170            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
171                origin_lines_y[round(char.origin.y, 1)].append(char)
172            else:
173                _LOGGER.error("Unknown char rotation:", char, char.rotation)
174
175        # Convert characters into lines
176        bbox_lines_y = []
177        for chars in origin_lines_y.values():
178            # Remove lines with whitespace only
179            if all(c.unicode in {0xA, 0xD, 0x20} for c in chars):
180                continue
181            origin = statistics.fmean(c.origin.y for c in chars)
182            line = CharLine(
183                self,
184                chars,
185                min(c.bbox.bottom for c in chars),
186                origin,
187                max(c.bbox.top for c in chars),
188                max(c.height for c in chars),
189                sort_origin=self.height - origin,
190            )
191            bbox_lines_y.append(line)
192            # print(line, line.top, line.origin, line.bottom, line.height)
193        bbox_lines = sorted(bbox_lines_y, key=lambda line: line._sort_origin)
194
195        bbox_lines_x = []
196        for chars in origin_lines_x.values():
197            # Remove lines with whitespace only
198            if all(c.unicode in {0xA, 0xD, 0x20} for c in chars):
199                continue
200            line = CharLine(
201                self,
202                chars,
203                min(c.bbox.left for c in chars),
204                statistics.fmean(c.origin.x for c in chars),
205                max(c.bbox.right for c in chars),
206                max(c.width for c in chars),
207                270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90,
208            )
209            bbox_lines_x.append(line)
210        bbox_lines += sorted(bbox_lines_x, key=lambda line: line._sort_origin)
211
212        if not bbox_lines:
213            return []
214
215        # Merge lines that have overlapping bbox_lines
216        # FIXME: This merges lines that "collide" vertically like in formulas
217        merged_lines = []
218        current_line = bbox_lines[0]
219        for next_line in bbox_lines[1:]:
220            height = max(current_line.height, next_line.height)
221            # Calculate overlap via normalize origin (increasing with line index)
222            if (current_line._sort_origin + rtol * height) > (next_line._sort_origin - rtol * height):
223                # if line.rotation or self.rotation:
224                #     # The next line overlaps this one, we merge the shorter line
225                #     # (typically super- and subscript) into taller line
226                #     use_current = len(current_line.chars) >= len(next_line.chars)
227                # else:
228                use_current = current_line.height >= next_line.height
229                line = current_line if use_current else next_line
230                current_line = CharLine(
231                    self,
232                    current_line.chars + next_line.chars,
233                    line.bottom,
234                    line.origin,
235                    line.top,
236                    height,
237                    line.rotation,
238                    sort_origin=line._sort_origin,
239                )
240            else:
241                # The next line does not overlap the current line
242                merged_lines.append(current_line)
243                current_line = next_line
244        # append last line
245        merged_lines.append(current_line)
246
247        # Sort all lines horizontally based on character origin
248        sorted_lines = []
249        for line in merged_lines:
250            if line.rotation == 90:
251
252                def sort_key(char):
253                    if char.unicode in {0xA, 0xD}:
254                        return char.tbbox.midpoint.y - 1e9
255                    return char.tbbox.midpoint.y
256            elif line.rotation == 270:
257
258                def sort_key(char):
259                    if char.unicode in {0xA, 0xD}:
260                        return -char.tbbox.midpoint.y + 1e9
261                    return -char.tbbox.midpoint.y
262            else:
263
264                def sort_key(char):
265                    if char.unicode in {0xA, 0xD}:
266                        return char.origin.x + 1e9
267                    return char.origin.x
268
269            sorted_lines.append(
270                CharLine(
271                    self,
272                    sorted(line.chars, key=sort_key),
273                    line.bottom,
274                    line.origin,
275                    line.top,
276                    line.height,
277                    line.rotation,
278                    area.left,
279                    sort_origin=line._sort_origin,
280                )
281            )
282
283        return sorted_lines

Coalesce the characters in the area that pass the predicate into lines.

1. Every character in the area is filtered by the predicate.
2. Character orientation is split into horizontal (left->right) and vertical (bottom->top) character lines sorted by x or y position. Lines containing only whitespace are discarded.
3. Overlapping character lines are merged into sub- and superscript using rtol * max(current_line.height, next_line.height) as the tolerance for checking if the lines overlap.
4. The characters in the merged lines are re-sorted by origin.

Parameters

area: Area to search for characters.
predicate: Function that returns False for characters to discard; by default all characters are included.
rtol: Relative tolerance to separate lines vertically; defaults to the sc spacing.

Returns

A list of character lines sorted by x or y position.

def graphic_bboxes_in_area( self, area: modm_data.utils.Rectangle, with_graphics: bool = True) -> list[tuple[modm_data.utils.Rectangle, modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure | None]]: View Source

285    def graphic_bboxes_in_area(
286        self, area: Rectangle, with_graphics: bool = True
287    ) -> list[tuple[Rectangle, Table | Figure | None]]:
288        """
289        Coalesce the graphics in the area into full width bounding boxes.
290
291        1. Group vertically overlapping graphics.
292        2. Widen the overlapped graphics bounding boxes to the edges of the area.
293
294        :param area: area to search for content.
295        :param with_graphics: search for graphics in the area.
296        :return: list of tuples (bounding box, graphic objects or `None`).
297        """
298        if with_graphics:
299            graphics = self.graphics_in_area(area)
300            regions = []
301            # Check if graphics bounding boxes overlap vertically and group them
302            for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
303                gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
304                for reg in regions:
305                    if reg.overlaps(gbbox.bottom, gbbox.top):
306                        # They overlap, so merge them
307                        reg.v0 = min(reg.v0, gbbox.bottom)
308                        reg.v1 = max(reg.v1, gbbox.top)
309                        reg.objs.append(graphic)
310                        break
311                else:
312                    regions.append(Region(gbbox.bottom, gbbox.top, graphic))
313
314            # print(regions)
315            # Coalesce all overlapped graphics objects into full width areas
316            areas = []
317            ypos = area.top
318            for reg in regions:
319                if ypos - reg.v1 > self._spacing["y_em"]:
320                    areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
321                for obj in reg.objs:
322                    oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
323                    areas.append((oarea, obj))
324                ypos = reg.v0
325            areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
326        else:
327            areas = [(area, None)]
328        return areas

Coalesce the graphics in the area into full width bounding boxes.

1. Group vertically overlapping graphics.
2. Widen the overlapped graphics bounding boxes to the edges of the area.

Parameters

area: area to search for content.
with_graphics: search for graphics in the area.

Returns

list of tuples (bounding box, graphic object or None).

def objects_in_area( self, area: modm_data.utils.Rectangle, with_graphics: bool = True) -> list[modm_data.pdf2html.line.CharLine | modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure]: View Source

330    def objects_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[CharLine | Table | Figure]:
331        """
332        Find all content objects in this area.
333
334        :param area: area to search for content.
335        :param with_graphics: search for graphics in the area.
336        :return: list of content objects sorted top to bottom.
337        """
338        self._link_characters()
339        areas = self.graphic_bboxes_in_area(area, with_graphics)
340        objects = []
341        for narea, obj in areas:
342            if obj is None:
343                objects += self.charlines_in_area(narea)
344            else:
345                oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
346
347                def predicate(c):
348                    return not obj.bbox.contains(c.origin)
349
350                lines = self.charlines_in_area(oarea, predicate)
351                # print(obj, oarea, lines, [line.content for line in lines])
352                objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
353        return objects

Find all content objects in this area.

Parameters

area: area to search for content.
with_graphics: search for graphics in the area.

Returns

list of content objects sorted top to bottom.

def graphics_in_area( self, area: modm_data.utils.Rectangle) -> list[modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure]: View Source

    def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
        """
        Find all tables and figures in this area.

        This implementation always returns an empty list.
        NOTE(review): presumably subclasses override this to detect graphics — confirm.

        :param area: area to search for graphics.
        :return: list of tables and figures.
        """
        return []

Find all tables and figures in this area.

Parameters

area: area to search for graphics.

Returns

list of tables and figures.

def ast_in_area( self, area: modm_data.utils.Rectangle, with_graphics: bool = True) -> anytree.node.node.Node: View Source

    def ast_in_area(self, area: Rectangle, with_graphics: bool = True) -> Node:
        """
        Convert the area content into an abstract syntax tree.

        This implementation only creates a single node named "area" that
        carries the area as `obj`, `int(area.left)` as `xpos` and this page as
        `page`. The `with_graphics` parameter is not used here.
        NOTE(review): presumably subclasses add the actual content — confirm.

        :param area: area to search for content.
        :param with_graphics: include graphics in the area.
        :return: An abstract syntax tree including the content formatting.
        """
        return Node("area", obj=area, xpos=int(area.left), page=self)

Convert the area content into an abstract syntax tree.

Parameters

area: area to search for content.
with_graphics: include graphics in the area.

Returns

An abstract syntax tree including the content formatting.

content_ast: list[anytree.node.node.Node] View Source

374    @property
375    def content_ast(self) -> list[Node]:
376        """The abstract syntax trees in the content area."""
377        ast = []
378        with_graphics = True
379        for area in self._areas["content"]:
380            ast.append(self.ast_in_area(area, with_graphics=with_graphics))
381        # Add a page node to the first leaf to keep track of where a page starts
382        first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
383        Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
384        return ast

The abstract syntax trees in the content area.

content_objects: list[modm_data.pdf2html.line.CharLine | modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure] View Source

386    @property
387    def content_objects(self) -> list[CharLine | Table | Figure]:
388        """All objects in the content areas."""
389        objs = []
390        for area in self._areas["content"]:
391            objs.extend(self.objects_in_area(area))
392        return objs

All objects in the content areas.

content_graphics: list[modm_data.pdf2html.table.Table | modm_data.pdf2html.figure.Figure] View Source

394    @property
395    def content_graphics(self) -> list[Table | Figure]:
396        """All graphics in the content areas."""
397        objs = []
398        for area in self._areas["content"]:
399            objs.extend(self.graphics_in_area(area))
400        return objs

All graphics in the content areas.

content_lines: list[modm_data.pdf2html.line.CharLine] View Source

402    @property
403    def content_lines(self) -> list[CharLine]:
404        """All lines in the content areas."""
405        objs = []
406        for area in self._areas["content"]:
407            objs.extend(self.charlines_in_area(area))
408        return objs

All lines in the content areas.

content_tables: list[modm_data.pdf2html.table.Table] View Source

410    @property
411    def content_tables(self) -> list[Table]:
412        """All tables in the content areas."""
413        return [o for o in self.content_graphics if isinstance(o, Table)]

All tables in the content areas.

content_figures: list[modm_data.pdf2html.figure.Figure] View Source

415    @property
416    def content_figures(self) -> list[Figure]:
417        """All figures in the content areas."""
418        return [o for o in self.content_graphics if isinstance(o, Figure)]

All figures in the content areas.

modm_data.pdf2html.page

Parameters

Parameters

Returns

Parameters

Returns

Parameters

Returns

Parameters

Returns

Parameters

Returns

Parameters

Returns

Inherited Members