diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py
index 82188e6b..91a7e39e 100644
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             text_stream = self.path_or_stream.getvalue().decode("utf-8")
             self.lines = text_stream.split("\n")
         if isinstance(self.path_or_stream, Path):
-            with open(self.path_or_stream, "r", encoding="utf-8") as f:
+            with open(self.path_or_stream, encoding="utf-8") as f:
                 self.lines = f.readlines()
         self.valid = True
@@ -75,7 +75,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

         return doc

-    def _parse(self, doc: DoclingDocument):
+    def _parse(self, doc: DoclingDocument):  # noqa: C901
        """
        Main function that orchestrates the parsing by yielding components:
        title, section headers, text, lists, and tables.
@@ -95,7 +95,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
         indents: dict[int, Union[GroupItem, None]] = {}

-        for i in range(0, 10):
+        for i in range(10):
             parents[i] = None
             indents[i] = None
diff --git a/docling/backend/csv_backend.py b/docling/backend/csv_backend.py
index 9159bd47..94d37d03 100644
--- a/docling/backend/csv_backend.py
+++ b/docling/backend/csv_backend.py
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
         head = self.content.readline()
         dialect = csv.Sniffer().sniff(head, ",;\t|:")
         _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
             raise RuntimeError(
                 f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
             )
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
index 081efa6f..33e7792d 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py
index 1e6ea9c1..6c12b663 100644
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py
index 232081fd..3e59f123 100644
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@@ -1,14 +1,14 @@
 import logging
-import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
-from PIL import Image, ImageDraw
+from PIL import Image
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
diff --git a/docling/backend/docx/latex/latex_dict.py b/docling/backend/docx/latex/latex_dict.py
index 03234788..13486478 100644
--- a/docling/backend/docx/latex/latex_dict.py
+++ b/docling/backend/docx/latex/latex_dict.py
@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
-
 """
 Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
 On 23/01/2025
 """

-from __future__ import unicode_literals
-
 CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

 BLANK = ""
@@ -79,7 +75,6 @@ CHR_BO = {
 }

 T = {
-    "\u2192": "\\rightarrow ",
     # Greek letters
     "\U0001d6fc": "\\alpha ",
     "\U0001d6fd": "\\beta ",
diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py
index 1289ffbd..d1e5453d 100644
--- a/docling/backend/docx/latex/omml.py
+++ b/docling/backend/docx/latex/omml.py
@@ -76,7 +76,7 @@ def get_val(key, default=None, store=CHR):
     return default


-class Tag2Method(object):
+class Tag2Method:
     def call_method(self, elm, stag=None):
         getmethod = self.tag2meth.get
         if stag is None:
@@ -157,7 +157,7 @@ class Pr(Tag2Method):
     def do_common(self, elm):
         stag = elm.tag.replace(OMML_NS, "")
         if stag in self.__val_tags:
-            t = elm.get("{0}val".format(OMML_NS))
+            t = elm.get(f"{OMML_NS}val")
             self.__innerdict[stag] = t
         return None
@@ -246,7 +246,6 @@ class oMath2Latex(Tag2Method):
         """
        the Pre-Sub-Superscript object -- Not support yet
        """
-        pass

     def do_sub(self, elm):
         text = self.process_children(elm)
@@ -329,7 +328,7 @@ class oMath2Latex(Tag2Method):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise NotSupport("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim %s" % t_dict["e"])
         else:
             return latex_s.format(lim=t_dict.get("lim"))
@@ -411,7 +410,7 @@ class oMath2Latex(Tag2Method):
         """
         _str = []
         _base_str = []
-        found_text = elm.findtext("./{0}t".format(OMML_NS))
+        found_text = elm.findtext(f"./{OMML_NS}t")
         if found_text:
             for s in found_text:
                 out_latex_str = self.process_unicode(s)
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 3534d827..83226d7e 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels = 10
         self.level = 0
         self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
-        for i in range(0, self.max_levels):
+        for i in range(self.max_levels):
             self.parents[i] = None

         try:
@@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.analyze_tag(cast(Tag, element), doc)
                 except Exception as exc_child:
                     _log.error(
-                        f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+                        f"Error processing child from tag {tag.name}: {exc_child!r}"
                     )
                     raise exc_child
             elif isinstance(element, NavigableString) and not isinstance(
@@ -357,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             marker = ""
             enumerated = False
             if parent_label == GroupLabel.ORDERED_LIST:
-                marker = f"{str(index_in_list)}."
+                marker = f"{index_in_list!s}."
                 enumerated = True
             doc.add_list_item(
                 text=text,
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index 2abe5bae..f8a97a73 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             # otherwise they represent emphasis (bold or italic)
             self.markdown = self._shorten_underscore_sequences(text_stream)
         if isinstance(self.path_or_stream, Path):
-            with open(self.path_or_stream, "r", encoding="utf-8") as f:
+            with open(self.path_or_stream, encoding="utf-8") as f:
                 md_content = f.read()
                 # remove invalid sequences
                 # very long sequences of underscores will lead to unnecessary long processing times.
@@ -235,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
                 parent_item = doc.add_group(
-                    label=label, name=f"list", parent=parent_item
+                    label=label, name="list", parent=parent_item
                 )

         elif (
@@ -319,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self._html_blocks += 1
             self._process_inline_text(parent_item, doc)
             self._close_table(doc)
-            _log.debug("HTML Block: {}".format(element))
+            _log.debug(f"HTML Block: {element}")
             if (
                 len(element.body) > 0
             ):  # If Marko doesn't return any content for HTML block, skip it
@@ -331,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         else:
             if not isinstance(element, str):
                 self._close_table(doc)
-                _log.debug("Some other element: {}".format(element))
+                _log.debug(f"Some other element: {element}")

         processed_block_types = (
             marko.block.Heading,
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index 2de0da1b..c0ef2bce 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -120,7 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB

         return prov

-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
         is_a_list = False
         is_list_group_created = False
         enum_list_item_value = 0
@@ -243,7 +243,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                     enum_marker = str(enum_list_item_value) + "."
                 if not is_list_group_created:
                     new_list = doc.add_group(
-                        label=list_label, name=f"list", parent=parent_slide
+                        label=list_label, name="list", parent=parent_slide
                     )
                     is_list_group_created = True
                 doc.add_list_item(
@@ -372,7 +372,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         max_levels = 10
         parents = {}  # type: ignore

-        for i in range(0, max_levels):
+        for i in range(max_levels):
             parents[i] = None

         # Loop through each slide
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index 5915c0a5..c77b0783 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
             )
             if cell is None or cell._tc in cell_set:
-                _log.debug(f" skipped since repeated content")
+                _log.debug(" skipped since repeated content")
                 col_idx += cell.grid_span
                 continue
             else:
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     image=ImageRef.from_pil(image=pil_image, dpi=72),
                     caption=None,
                 )
-            except (UnidentifiedImageError, OSError) as e:
+            except (UnidentifiedImageError, OSError):
                 _log.warning("Warning: image cannot be loaded by Pillow")
                 doc.add_picture(
                     parent=self.parents[level - 1],
diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py
index cfecc7e6..3d07578b 100644
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Set, Union
+from typing import Optional, Set, Union

 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py
index d8e9a2ce..67e1f059 100644
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         self.valid = True  # No better way to tell from pypdfium.
         try:
             self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
-        except PdfiumError as e:
+        except PdfiumError:
             _log.info(
                 f"An exception occurred when loading page {page_no} of document {document_hash}.",
                 exc_info=True,
diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py
index 06cbb2f4..17271fb8 100755
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@@ -348,7 +348,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

         return

-    def _parse_element_citation(self, node: etree._Element) -> str:
+    def _parse_element_citation(self, node: etree._Element) -> str:  # noqa: C901
         citation: Citation = {
             "author_names": "",
             "title": "",
@@ -439,7 +439,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
             if len(node.xpath("lpage")) > 0:
                 citation["page"] += (
-                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()  # noqa: RUF001
                 )

         # Flatten the citation to string
@@ -594,9 +594,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

             try:
                 self._add_table(doc, parent, table)
-            except Exception as e:
-                _log.warning(f"Skipping unsupported table in {str(self.file)}")
-                pass
+            except Exception:
+                _log.warning(f"Skipping unsupported table in {self.file!s}")

         return
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index a3e04081..29b41846 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -162,7 +162,6 @@ class PatentUspto(ABC):
         Returns:
             The patent parsed as a docling document.
         """
-        pass


 class PatentUsptoIce(PatentUspto):
@@ -264,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:
@@ -280,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them
@@ -314,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
                 self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:
@@ -602,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:
@@ -615,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them
@@ -649,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.
             Args:
@@ -690,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
             if tag in [member.value for member in self.Element]:
                 if (
                     tag == self.Element.HEADING.value
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                 ):
                     level_attr: str = attributes.get("LVL", "")
                     new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@@ -742,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
             # headers except claims statement
             elif (
                 self.Element.HEADING.value in self.property
-                and not self.Element.SDOCL.value in self.property
+                and self.Element.SDOCL.value not in self.property
                 and text.strip()
             ):
                 self.parents[self.level + 1] = self.doc.add_heading(
@@ -1163,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:
@@ -1176,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them
@@ -1210,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
                 self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:
@@ -1526,7 +1525,7 @@ class XmlTable:

         return ncols_max

-    def _parse_table(self, table: Tag) -> TableData:
+    def _parse_table(self, table: Tag) -> TableData:  # noqa: C901
         """Parse the content of a table tag.

         Args:
@@ -1721,7 +1720,7 @@ class HtmlEntity:
             "0": "⁰",
             "+": "⁺",
             "-": "⁻",
-            "−": "⁻",
+            "−": "⁻",  # noqa: RUF001
             "=": "⁼",
             "(": "⁽",
             ")": "⁾",
@@ -1745,7 +1744,7 @@ class HtmlEntity:
             "0": "₀",
             "+": "₊",
             "-": "₋",
-            "−": "₋",
+            "−": "₋",  # noqa: RUF001
             "=": "₌",
             "(": "₍",
             ")": "₎",
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 3cb521ad..f60f11cb 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -6,8 +6,9 @@ import sys
 import tempfile
 import time
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional, Type
+from typing import Annotated, Dict, List, Optional, Type

 import rich.table
 import typer
@@ -288,7 +289,7 @@ def convert(
         ...,
         help=(
             f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
-            f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+            f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
             f"Use the option --show-external-plugins to see the options allowed with external plugins."
         ),
     ),
diff --git a/docling/cli/models.py b/docling/cli/models.py
index 7bc313c1..80672714 100644
--- a/docling/cli/models.py
+++ b/docling/cli/models.py
@@ -62,7 +62,7 @@ def download(
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
+            help="Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
     all: Annotated[
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index b1daa482..db013f50 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -10,7 +10,7 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
+from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image
@@ -243,7 +243,7 @@ class Page(BaseModel):
         if self._backend is None:
             return self._image_cache.get(scale, None)

-        if not scale in self._image_cache:
+        if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
             else:
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index b925404c..14ddd2fd 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -1,13 +1,13 @@
 import csv
 import logging
 import re
+from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
     Dict,
-    Iterable,
     List,
     Literal,
     Optional,
@@ -18,31 +18,9 @@ from typing import (
 import filetype
 from docling_core.types.doc import (
-    DocItem,
     DocItemLabel,
     DoclingDocument,
-    PictureItem,
-    SectionHeaderItem,
-    TableItem,
-    TextItem,
 )
-from docling_core.types.doc.document import ListItem
-from docling_core.types.legacy_doc.base import (
-    BaseText,
-    Figure,
-    GlmTableCell,
-    PageDimensions,
-    PageReference,
-    Prov,
-    Ref,
-)
-from docling_core.types.legacy_doc.base import Table as DsSchemaTable
-from docling_core.types.legacy_doc.base import TableCell
-from docling_core.types.legacy_doc.document import (
-    CCSDocumentDescription as DsDocumentDescription,
-)
-from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
-from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
@@ -65,7 +43,7 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.profiling import ProfilingItem
-from docling.utils.utils import create_file_hash, create_hash
+from docling.utils.utils import create_file_hash

 if TYPE_CHECKING:
     from docling.document_converter import FormatOption
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 7489f49a..4e37f409 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -1,11 +1,11 @@
 import hashlib
 import logging
-import math
 import sys
 import time
+from collections.abc import Iterable, Iterator
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union

 from pydantic import BaseModel, ConfigDict, model_validator, validate_call
@@ -254,7 +254,7 @@ class DocumentConverter:

         if not had_result and raises_on_error:
             raise ConversionError(
-                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )

     def _convert(
@@ -266,7 +266,7 @@ class DocumentConverter:
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
-            _log.info(f"Going to convert document batch...")
+            _log.info("Going to convert document batch...")

             # parallel processing only within input_batch
             # with ThreadPoolExecutor(
diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py
index 4fbefcc4..f7e82b5f 100644
--- a/docling/models/api_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@@ -1,4 +1,4 @@
-from typing import Iterable
+from collections.abc import Iterable

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
diff --git a/docling/models/base_model.py b/docling/models/base_model.py
index 95760cdf..04df812d 100644
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import Any, Generic, Iterable, Optional, Protocol, Type
+from collections.abc import Iterable
+from typing import Generic, Optional, Protocol, Type

 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar
diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
index c8235808..9f05aed3 100644
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -1,12 +1,12 @@
 import copy
 import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional, Type
+from typing import List, Optional, Type

 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py
index 10426c24..bf747c56 100644
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@@ -1,7 +1,8 @@
 import re
 from collections import Counter
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union

 import numpy as np
 from docling_core.types.doc import (
diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py
index f51d7350..6a57a74d 100644
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Union

 import numpy as np
 from docling_core.types.doc import (
diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
index 5e22e1d3..c714af85 100644
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -1,8 +1,9 @@
 import logging
 import warnings
 import zipfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional, Type
+from typing import List, Optional, Type

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -98,8 +99,10 @@ class EasyOcrModel(BaseOcrModel):
         progress: bool = False,
     ) -> Path:
         # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
-        from easyocr.config import detection_models as det_models_dict
-        from easyocr.config import recognition_models as rec_models_dict
+        from easyocr.config import (
+            detection_models as det_models_dict,
+            recognition_models as rec_models_dict,
+        )

         if local_dir is None:
             local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
diff --git a/docling/models/factories/__init__.py b/docling/models/factories/__init__.py
index 9a3308e1..a6adb3f2 100644
--- a/docling/models/factories/__init__.py
+++ b/docling/models/factories/__init__.py
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
 logger = logging.getLogger(__name__)

-@lru_cache()
+@lru_cache
 def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     factory = OcrFactory()
     factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     return factory

-@lru_cache()
+@lru_cache
 def get_picture_description_factory(
     allow_external_plugins: bool = False,
 ) -> PictureDescriptionFactory:
diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_mlx_model.py
index 571f85a0..8516cee5 100644
--- a/docling/models/hf_mlx_model.py
+++ b/docling/models/hf_mlx_model.py
@@ -1,18 +1,16 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py
index 7b4771d8..3d203b5e 100644
--- a/docling/models/hf_vlm_model.py
+++ b/docling/models/hf_vlm_model.py
@@ -1,16 +1,15 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
@@ -41,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
             device = decide_device(accelerator_options.device)
             self.device = device

-            _log.debug("Available device for HuggingFace VLM: {}".format(device))
+            _log.debug(f"Available device for HuggingFace VLM: {device}")

             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index a61f7726..ae373012 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -1,8 +1,9 @@
 import copy
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional

 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py
index c9c778f0..a8ff55b8 100644
--- a/docling/models/ocr_mac_model.py
+++ b/docling/models/ocr_mac_model.py
@@ -1,8 +1,9 @@
 import logging
 import sys
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Tuple, Type
+from typing import Optional, Type

 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):

         if self.enabled:
             if "darwin" != sys.platform:
-                raise RuntimeError(f"OcrMac is only supported on Mac.")
+                raise RuntimeError("OcrMac is only supported on Mac.")
             install_errmsg = (
                 "ocrmac is not correctly installed. "
                 "Please install it via `pip install ocrmac` to use this OCR engine. "
diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py
index 4c43560e..7153181e 100644
--- a/docling/models/page_assemble_model.py
+++ b/docling/models/page_assemble_model.py
@@ -1,6 +1,7 @@
 import logging
 import re
-from typing import Iterable, List
+from collections.abc import Iterable
+from typing import List

 from pydantic import BaseModel

@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
                     sanitized_text = "".join(lines)

                     # Text normalization
-                    sanitized_text = sanitized_text.replace("⁄", "/")
-                    sanitized_text = sanitized_text.replace("’", "'")
-                    sanitized_text = sanitized_text.replace("‘", "'")
+                    sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+                    sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+                    sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
                     sanitized_text = sanitized_text.replace("“", '"')
                     sanitized_text = sanitized_text.replace("”", '"')
                     sanitized_text = sanitized_text.replace("•", "·")
diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py
index d1b29e38..b45b189e 100644
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional
+from typing import Optional

 from PIL import ImageDraw
 from pydantic import BaseModel
diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py
index 1aa73518..44bb5e21 100644
--- a/docling/models/picture_description_api_model.py
+++ b/docling/models/picture_description_api_model.py
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union

 from PIL import Image
diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py
index 96169227..2f6e6479 100644
--- a/docling/models/picture_description_base_model.py
+++ b/docling/models/picture_description_base_model.py
@@ -1,12 +1,11 @@
-import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable, List, Optional, Type, Union
+from typing import List, Optional, Type, Union

 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
-    PictureClassificationClass,
     PictureItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py
index 907c1a43..374f575d 100644
--- a/docling/models/picture_description_vlm_model.py
+++ b/docling/models/picture_description_vlm_model.py
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union

 from PIL import Image
diff --git a/docling/models/rapid_ocr_model.py b/docling/models/rapid_ocr_model.py
index 77190cfe..2c7f4357 100644
--- a/docling/models/rapid_ocr_model.py
+++ b/docling/models/rapid_ocr_model.py
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
diff --git a/docling/models/readingorder_model.py b/docling/models/readingorder_model.py
index a40bc5a9..287bcd67 100644
--- a/docling/models/readingorder_model.py
+++ b/docling/models/readingorder_model.py
@@ -1,12 +1,7 @@
-import copy
-import random
 from pathlib import Path
 from typing import Dict, List

 from docling_core.types.doc import (
-    BoundingBox,
-    CoordOrigin,
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
     TableData,
 )
 from docling_core.types.doc.document import ContentLayer
-from docling_core.types.legacy_doc.base import Ref
-from docling_core.types.legacy_doc.document import BaseText
 from docling_ibm_models.reading_order.reading_order_rb import (
     PageElement as ReadingOrderPageElement,
+    ReadingOrderPredictor,
 )
-from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
-from PIL import ImageDraw
 from pydantic import BaseModel, ConfigDict

 from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
     TextElement,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.settings import settings
 from docling.utils.profiling import ProfilingScope, TimeRecorder
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index 3e81e288..44579b9f 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -1,13 +1,13 @@
 import copy
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional

 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_core.types.doc.page import (
     BoundingRectangle,
-    SegmentedPdfPage,
     TextCellUnit,
 )
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index 365fdae7..156045e9 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -3,9 +3,10 @@ import io
 import logging
 import os
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
index 84a02a3a..ef8c806f 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type

 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py
index b278a0c7..29475d68 100644
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -3,9 +3,10 @@ import logging
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, List
+from collections.abc import Iterable
+from typing import Any, Callable, List

-from docling_core.types.doc import DoclingDocument, NodeItem
+from docling_core.types.doc import NodeItem

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.

         total_elapsed_time = 0.0
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
-            for i in range(0, conv_res.input.page_count):
+            for i in range(conv_res.input.page_count):
                 start_page, end_page = conv_res.input.limits.page_range
                 if (start_page - 1) <= i <= (end_page - 1):
                     conv_res.pages.append(Page(page_no=i))
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index ae2d918d..fe93c6c5 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -1,5 +1,4 @@
 import logging
-import sys
 import warnings
 from pathlib import Path
 from typing import Optional, cast
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index b8892a49..9a7b51eb 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -1,5 +1,4 @@
 import logging
-import warnings
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
diff --git a/docling/utils/export.py b/docling/utils/export.py
index 4f3ac6fe..debf09ff 100644
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@@ -1,8 +1,8 @@
 import logging
-from typing import Any, Dict, Iterable, List, Tuple, Union
+from collections.abc import Iterable
+from typing import Any, Dict, List, Tuple, Union

 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table

 from docling.datamodel.document import ConversionResult, Page
diff --git a/docling/utils/glm_utils.py b/docling/utils/glm_utils.py
index c3c43536..46ac0bce 100644
--- a/docling/utils/glm_utils.py
+++ b/docling/utils/glm_utils.py
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
     return unique_objects

-def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
     origin = DocumentOrigin(
         mimetype="application/pdf",
         filename=doc_glm["file-info"]["filename"],
diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py
index 66ce9aec..4c25655d 100644
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -18,7 +18,7 @@ class UnionFind:

     def __init__(self, elements):
         self.parent = {elem: elem for elem in elements}
-        self.rank = {elem: 0 for elem in elements}
+        self.rank = dict.fromkeys(elements, 0)

     def find(self, x):
         if self.parent[x] != x:
diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py
index 694fe042..6a1eb838 100644
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@@ -37,7 +37,7 @@ def download_models(
     output_dir.mkdir(exist_ok=True, parents=True)

     if with_layout:
-        _log.info(f"Downloading layout model...")
+        _log.info("Downloading layout model...")
         LayoutModel.download_models(
             local_dir=output_dir / LayoutModel._model_repo_folder,
             force=force,
@@ -45,7 +45,7 @@ def download_models(
         )

     if with_tableformer:
-        _log.info(f"Downloading tableformer model...")
+        _log.info("Downloading tableformer model...")
         TableStructureModel.download_models(
             local_dir=output_dir / TableStructureModel._model_repo_folder,
             force=force,
@@ -53,7 +53,7 @@ def download_models(
         )

     if with_picture_classifier:
-        _log.info(f"Downloading picture classifier model...")
+        _log.info("Downloading picture classifier model...")
         DocumentPictureClassifier.download_models(
             local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
             force=force,
@@ -61,7 +61,7 @@ def download_models(
         )

     if with_code_formula:
-        _log.info(f"Downloading code formula model...")
+        _log.info("Downloading code formula model...")
         CodeFormulaModel.download_models(
             local_dir=output_dir / CodeFormulaModel._model_repo_folder,
             force=force,
@@ -69,7 +69,7 @@ def download_models(
         )

     if with_smolvlm:
-        _log.info(f"Downloading SmolVlm model...")
+        _log.info("Downloading SmolVlm model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=smolvlm_picture_description.repo_id,
             local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@@ -78,7 +78,7 @@ def download_models(
         )

     if with_granite_vision:
-        _log.info(f"Downloading Granite Vision model...")
+        _log.info("Downloading Granite Vision model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=granite_picture_description.repo_id,
             local_dir=output_dir / granite_picture_description.repo_cache_folder,
@@ -87,7 +87,7 @@ def download_models(
         )

     if with_easyocr:
-        _log.info(f"Downloading easyocr models...")
+        _log.info("Downloading easyocr models...")
         EasyOcrModel.download_models(
             local_dir=output_dir / EasyOcrModel._model_repo_folder,
             force=force,
diff --git a/docs/examples/backend_xml_rag.ipynb b/docs/examples/backend_xml_rag.ipynb
index 091f116d..3af38b4f 100644
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@@ -383,7 +383,7 @@
     "\n",
     "print(f\"Downloading {url}...\")\n",
     "buf = BytesIO(requests.get(url).content)\n",
-    "print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
+    "print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
     "with zipfile.ZipFile(buf) as zf:\n",
     "    res = zf.testzip()\n",
     "    if res:\n",
diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py
index fd68e62a..25eb2bac 100644
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@@ -1,8 +1,8 @@
 import json
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 import yaml
 from docling_core.types.doc import ImageRefMode
@@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption

 _log = logging.getLogger(__name__)
diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py
index ddc19217..3b8ae6df 100644
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@@ -3,7 +3,6 @@ import logging
 import time
 from pathlib import Path

-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
@@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.models.ocr_mac_model import OcrMacOptions
-from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
-from docling.models.tesseract_ocr_model import TesseractOcrOptions

 _log = logging.getLogger(__name__)
diff --git a/docs/examples/develop_formula_understanding.py b/docs/examples/develop_formula_understanding.py
index 1ebfc46c..e9972d02 100644
--- a/docs/examples/develop_formula_understanding.py
+++ b/docs/examples/develop_formula_understanding.py
@@ -3,8 +3,8 @@
 # It does not run the actual formula understanding model.

 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem

diff --git a/docs/examples/develop_picture_enrichment.py b/docs/examples/develop_picture_enrichment.py
index 9991afe9..9e3d3067 100644
--- a/docs/examples/develop_picture_enrichment.py
+++ b/docs/examples/develop_picture_enrichment.py
@@ -3,8 +3,9 @@
 # It does not run the actual picture classifier model.
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable
+from typing import Any

 from docling_core.types.doc import (
     DoclingDocument,
diff --git a/docs/examples/export_figures.py b/docs/examples/export_figures.py
index c2186661..8ed14a70 100644
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@@ -4,7 +4,7 @@ from pathlib import Path

 from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py
index 8390d5fc..5525e87e 100644
--- a/docs/examples/full_page_ocr.py
+++ b/docs/examples/full_page_ocr.py
@@ -1,14 +1,9 @@
 from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    OcrMacOptions,
     PdfPipelineOptions,
-    RapidOcrOptions,
     TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb
index 2f6d9457..c8a8f42e 100644
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -153,10 +153,10 @@
    "source": [
     "for i, chunk in enumerate(chunk_iter):\n",
     "    print(f\"=== {i} ===\")\n",
-    "    print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
+    "    print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
     "\n",
     "    enriched_text = chunker.serialize(chunk=chunk)\n",
-    "    print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
+    "    print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
     "\n",
     "    print()"
    ]
@@ -353,11 +353,11 @@
     "for i, chunk in enumerate(chunks):\n",
     "    print(f\"=== {i} ===\")\n",
     "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
-    "    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
+    "    print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
     "\n",
     "    ser_txt = chunker.serialize(chunk=chunk)\n",
     "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
-    "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
+    "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
     "\n",
     "    print()"
    ]
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 6a15fe42..5211fa44 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -2,17 +2,11 @@ import json
 import time
 from pathlib import Path

-import yaml
-
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     VlmPipelineOptions,
-    granite_vision_vlm_conversion_options,
-    smoldocling_vlm_conversion_options,
     smoldocling_vlm_mlx_conversion_options,
 )
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
 for source in sources:
     start_time = time.time()
     print("================================================")
-    print("Processing... {}".format(source))
+    print(f"Processing... {source}")
     print("================================================")
     print("")
@@ -77,7 +71,7 @@ for source in sources:
         print(page.predictions.vlm_response.text)

     res.document.save_as_html(
-        filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
+        filename=Path(f"{out_path}/{res.input.file.stem}.html"),
         image_mode=ImageRefMode.REFERENCED,
         labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
     )
diff --git a/docs/examples/pictures_description.ipynb b/docs/examples/pictures_description.ipynb
index feeb00ba..a40a73aa 100644
--- a/docs/examples/pictures_description.ipynb
+++ b/docs/examples/pictures_description.ipynb
@@ -144,7 +144,7 @@
     "for pic in doc.pictures[:5]:\n",
     "    html_item = (\n",
     "        f\"{pic.self_ref}
     {pic.self_ref}