apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2025-04-14 14:48:04 +02:00
parent d74e407526
commit 73cec158c6
84 changed files with 172 additions and 225 deletions

@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             text_stream = self.path_or_stream.getvalue().decode("utf-8")
             self.lines = text_stream.split("\n")
         if isinstance(self.path_or_stream, Path):
-            with open(self.path_or_stream, "r", encoding="utf-8") as f:
+            with open(self.path_or_stream, encoding="utf-8") as f:
                 self.lines = f.readlines()
         self.valid = True
@@ -75,7 +75,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return doc

-    def _parse(self, doc: DoclingDocument):
+    def _parse(self, doc: DoclingDocument):  # noqa: C901
        """
        Main function that orchestrates the parsing by yielding components:
        title, section headers, text, lists, and tables.
@@ -95,7 +95,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
        indents: dict[int, Union[GroupItem, None]] = {}

-        for i in range(0, 10):
+        for i in range(10):
            parents[i] = None
            indents[i] = None
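
Note: the `open()` change above is ruff rule UP015 (redundant open mode): "r" is already the default mode. A minimal, self-contained sketch of the two equivalent spellings (temporary file used only for illustration):

    import tempfile

    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
        tmp.write("hello")
        path = tmp.name

    # UP015: "r" is the default, so both calls behave identically
    with open(path, "r", encoding="utf-8") as f:  # flagged by ruff
        a = f.read()
    with open(path, encoding="utf-8") as f:  # preferred spelling
        b = f.read()
    assert a == b == "hello"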

@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
         head = self.content.readline()
         dialect = csv.Sniffer().sniff(head, ",;\t|:")
         _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
             raise RuntimeError(
                 f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
             )
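
Note: this is ruff rule E713, which prefers `x not in y` over `not x in y`. Both spellings evaluate identically, as this small check (illustrative values) shows:

    delimiters = {",", ";", "\t", "|", ":"}
    d = "#"
    # E713 rewrites `not d in delimiters` into the equivalent `d not in delimiters`
    assert (d not in delimiters) == (not d in delimiters)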

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -1,14 +1,14 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
-from PIL import Image, ImageDraw
+from PIL import Image
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend

@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
 """
 Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
 On 23/01/2025
 """
-
-from __future__ import unicode_literals
-

 CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
 BLANK = ""
@@ -79,7 +75,6 @@ CHR_BO = {
 }

 T = {
     "\u2192": "\\rightarrow ",
-    # Greek letters
     "\U0001d6fc": "\\alpha ",
     "\U0001d6fd": "\\beta ",

@@ -76,7 +76,7 @@ def get_val(key, default=None, store=CHR):
     return default


-class Tag2Method(object):
+class Tag2Method:
     def call_method(self, elm, stag=None):
         getmethod = self.tag2meth.get
         if stag is None:
@@ -157,7 +157,7 @@ class Pr(Tag2Method):
     def do_common(self, elm):
         stag = elm.tag.replace(OMML_NS, "")
         if stag in self.__val_tags:
-            t = elm.get("{0}val".format(OMML_NS))
+            t = elm.get(f"{OMML_NS}val")
             self.__innerdict[stag] = t
         return None
@@ -246,7 +246,6 @@ class oMath2Latex(Tag2Method):
         """
         the Pre-Sub-Superscript object -- Not support yet
         """
-        pass

     def do_sub(self, elm):
         text = self.process_children(elm)
@@ -329,7 +328,7 @@ class oMath2Latex(Tag2Method):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise NotSupport("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim %s" % t_dict["e"])
         else:
             return latex_s.format(lim=t_dict.get("lim"))
@@ -411,7 +410,7 @@ class oMath2Latex(Tag2Method):
         """
         _str = []
         _base_str = []
-        found_text = elm.findtext("./{0}t".format(OMML_NS))
+        found_text = elm.findtext(f"./{OMML_NS}t")
         if found_text:
             for s in found_text:
                 out_latex_str = self.process_unicode(s)
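
Note: the `.format()` rewrites above are ruff rule UP032: an f-string is shorter and evaluated inline. A sketch using the OMML namespace string as the stand-in value:

    OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
    # UP032: "{0}val".format(OMML_NS) becomes an f-string
    assert "{0}val".format(OMML_NS) == f"{OMML_NS}val"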

@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels = 10
         self.level = 0
         self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
-        for i in range(0, self.max_levels):
+        for i in range(self.max_levels):
             self.parents[i] = None

         try:
@@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.analyze_tag(cast(Tag, element), doc)
                 except Exception as exc_child:
                     _log.error(
-                        f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+                        f"Error processing child from tag {tag.name}: {exc_child!r}"
                     )
                     raise exc_child
             elif isinstance(element, NavigableString) and not isinstance(
@@ -357,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             marker = ""
             enumerated = False
             if parent_label == GroupLabel.ORDERED_LIST:
-                marker = f"{str(index_in_list)}."
+                marker = f"{index_in_list!s}."
                 enumerated = True
             doc.add_list_item(
                 text=text,
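
Note: the `{repr(x)}` to `{x!r}` and `{str(x)}` to `{x!s}` rewrites are ruff rule RUF010 (use explicit conversion flags rather than calling repr/str inside the f-string). The two forms produce identical output:

    exc = ValueError("boom")
    assert f"{repr(exc)}" == f"{exc!r}"
    assert f"{str(exc)}" == f"{exc!s}"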

@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             # otherwise they represent emphasis (bold or italic)
             self.markdown = self._shorten_underscore_sequences(text_stream)
         if isinstance(self.path_or_stream, Path):
-            with open(self.path_or_stream, "r", encoding="utf-8") as f:
+            with open(self.path_or_stream, encoding="utf-8") as f:
                 md_content = f.read()
                 # remove invalid sequences
                 # very long sequences of underscores will lead to unnecessary long processing times.
@@ -235,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
                 parent_item = doc.add_group(
-                    label=label, name=f"list", parent=parent_item
+                    label=label, name="list", parent=parent_item
                 )
         elif (
@@ -319,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self._html_blocks += 1
             self._process_inline_text(parent_item, doc)
             self._close_table(doc)
-            _log.debug("HTML Block: {}".format(element))
+            _log.debug(f"HTML Block: {element}")
             if (
                 len(element.body) > 0
             ):  # If Marko doesn't return any content for HTML block, skip it
@@ -331,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         else:
             if not isinstance(element, str):
                 self._close_table(doc)
-                _log.debug("Some other element: {}".format(element))
+                _log.debug(f"Some other element: {element}")

         processed_block_types = (
             marko.block.Heading,
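
Note: `name=f"list"` has no placeholders, so the `f` prefix does nothing; ruff rule F541 strips it. For example:

    # F541: an f-string without any {} placeholder is just a plain string
    assert f"list" == "list"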

@@ -120,7 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         return prov

-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
         is_a_list = False
         is_list_group_created = False
         enum_list_item_value = 0
@@ -243,7 +243,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                     enum_marker = str(enum_list_item_value) + "."
                 if not is_list_group_created:
                     new_list = doc.add_group(
-                        label=list_label, name=f"list", parent=parent_slide
+                        label=list_label, name="list", parent=parent_slide
                     )
                     is_list_group_created = True
                 doc.add_list_item(
@@ -372,7 +372,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         max_levels = 10
         parents = {}  # type: ignore
-        for i in range(0, max_levels):
+        for i in range(max_levels):
             parents[i] = None

         # Loop through each slide

@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
                 )
                 if cell is None or cell._tc in cell_set:
-                    _log.debug(f"  skipped since repeated content")
+                    _log.debug("  skipped since repeated content")
                     col_idx += cell.grid_span
                     continue
                 else:
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     image=ImageRef.from_pil(image=pil_image, dpi=72),
                     caption=None,
                 )
-            except (UnidentifiedImageError, OSError) as e:
+            except (UnidentifiedImageError, OSError):
                 _log.warning("Warning: image cannot be loaded by Pillow")
                 doc.add_picture(
                     parent=self.parents[level - 1],
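
Note: dropping `as e` when the bound exception is never used is ruff rule F841 (unused local variable). The handler behaves the same without the binding; a self-contained sketch (the raised error is a stand-in for Pillow's):

    def read_image(data: bytes) -> None:
        try:
            raise OSError("cannot identify image file")  # stand-in for Pillow's error
        except OSError:  # F841 fix: drop `as e` when the variable is never used
            print("Warning: image cannot be loaded by Pillow")

    read_image(b"")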

@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Set, Union
+from typing import Optional, Set, Union

 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
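
Note: moving `Iterable` from `typing` to `collections.abc` (repeated across most files in this commit) follows ruff rule UP035, since the `typing` aliases are deprecated and the abc versions work the same in annotations on Python 3.9+:

    from collections.abc import Iterable

    def total(xs: Iterable[int]) -> int:
        return sum(xs)

    assert total(range(4)) == 6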

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         self.valid = True  # No better way to tell from pypdfium.
         try:
             self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
-        except PdfiumError as e:
+        except PdfiumError:
             _log.info(
                 f"An exception occurred when loading page {page_no} of document {document_hash}.",
                 exc_info=True,

@@ -348,7 +348,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         return

-    def _parse_element_citation(self, node: etree._Element) -> str:
+    def _parse_element_citation(self, node: etree._Element) -> str:  # noqa: C901
         citation: Citation = {
             "author_names": "",
             "title": "",
@@ -439,7 +439,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
         if len(node.xpath("lpage")) > 0:
             citation["page"] += (
-                "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+                "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()  # noqa: RUF001
             )

         # Flatten the citation to string
@@ -594,9 +594,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             try:
                 self._add_table(doc, parent, table)
-            except Exception as e:
-                _log.warning(f"Skipping unsupported table in {str(self.file)}")
-                pass
+            except Exception:
+                _log.warning(f"Skipping unsupported table in {self.file!s}")

         return

@@ -162,7 +162,6 @@ class PatentUspto(ABC):
         Returns:
             The patent parsed as a docling document.
         """
-        pass


 class PatentUsptoIce(PatentUspto):
@@ -264,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
            self.style_html = HtmlEntity()

        @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
            """Signal the start of an element.

            Args:
@@ -280,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
            self._start_registered_elements(tag, attributes)

        @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
@@ -314,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
                self.text += unescaped

        @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
            """Signal the end of an element.

            Args:
@@ -602,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
            self.style_html = HtmlEntity()

        @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
            """Signal the start of an element.

            Args:
@@ -615,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
            self._start_registered_elements(tag, attributes)

        @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
@@ -649,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
                self.text += unescaped

        @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
            """Signal the end of an element.

            Args:
@@ -690,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
            if tag in [member.value for member in self.Element]:
                if (
                    tag == self.Element.HEADING.value
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                ):
                    level_attr: str = attributes.get("LVL", "")
                    new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@@ -742,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
            # headers except claims statement
            elif (
                self.Element.HEADING.value in self.property
-                and not self.Element.SDOCL.value in self.property
+                and self.Element.SDOCL.value not in self.property
                and text.strip()
            ):
                self.parents[self.level + 1] = self.doc.add_heading(
@@ -1163,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
            self.style_html = HtmlEntity()

        @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
            """Signal the start of an element.

            Args:
@@ -1176,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
            self._start_registered_elements(tag, attributes)

        @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
@@ -1210,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
                self.text += unescaped

        @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
            """Signal the end of an element.

            Args:
@@ -1526,7 +1525,7 @@ class XmlTable:
         return ncols_max

-    def _parse_table(self, table: Tag) -> TableData:
+    def _parse_table(self, table: Tag) -> TableData:  # noqa: C901
         """Parse the content of a table tag.

         Args:
@@ -1721,7 +1720,7 @@ class HtmlEntity:
        "0": "&#8304;",
        "+": "&#8314;",
        "-": "&#8315;",
-        "−": "&#8315;",
+        "−": "&#8315;",  # noqa: RUF001
        "=": "&#8316;",
        "(": "&#8317;",
        ")": "&#8318;",
@@ -1745,7 +1744,7 @@ class HtmlEntity:
        "0": "&#8320;",
        "+": "&#8330;",
        "-": "&#8331;",
-        "−": "&#8331;",
+        "−": "&#8331;",  # noqa: RUF001
        "=": "&#8332;",
        "(": "&#8333;",
        ")": "&#8334;",

@@ -6,8 +6,9 @@ import sys
 import tempfile
 import time
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional, Type
+from typing import Annotated, Dict, List, Optional, Type

 import rich.table
 import typer
@@ -288,7 +289,7 @@ def convert(
         ...,
         help=(
             f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
-            f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+            f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
             f"Use the option --show-external-plugins to see the options allowed with external plugins."
         ),
     ),

@@ -62,7 +62,7 @@ def download(
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
+            help="Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
     all: Annotated[

@@ -10,7 +10,7 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
+from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image
@@ -243,7 +243,7 @@ class Page(BaseModel):
         if self._backend is None:
             return self._image_cache.get(scale, None)

-        if not scale in self._image_cache:
+        if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
             else:

@@ -1,13 +1,13 @@
 import csv
 import logging
 import re
+from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
     Dict,
-    Iterable,
     List,
     Literal,
     Optional,
@@ -18,31 +18,9 @@ from typing import (
 import filetype
 from docling_core.types.doc import (
     DocItem,
-    DocItemLabel,
     DoclingDocument,
-    PictureItem,
-    SectionHeaderItem,
-    TableItem,
-    TextItem,
 )
-from docling_core.types.doc.document import ListItem
-from docling_core.types.legacy_doc.base import (
-    BaseText,
-    Figure,
-    GlmTableCell,
-    PageDimensions,
-    PageReference,
-    Prov,
-    Ref,
-)
-from docling_core.types.legacy_doc.base import Table as DsSchemaTable
-from docling_core.types.legacy_doc.base import TableCell
-from docling_core.types.legacy_doc.document import (
-    CCSDocumentDescription as DsDocumentDescription,
-)
-from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
-from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
@@ -65,7 +43,7 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.profiling import ProfilingItem
-from docling.utils.utils import create_file_hash, create_hash
+from docling.utils.utils import create_file_hash

 if TYPE_CHECKING:
     from docling.document_converter import FormatOption

@@ -1,11 +1,11 @@
 import hashlib
 import logging
 import math
 import sys
 import time
+from collections.abc import Iterable, Iterator
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union

 from pydantic import BaseModel, ConfigDict, model_validator, validate_call
@@ -254,7 +254,7 @@ class DocumentConverter:
         if not had_result and raises_on_error:
             raise ConversionError(
-                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )

     def _convert(
@@ -266,7 +266,7 @@ class DocumentConverter:
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
-            _log.info(f"Going to convert document batch...")
+            _log.info("Going to convert document batch...")

             # parallel processing only within input_batch
             # with ThreadPoolExecutor(

@@ -1,4 +1,4 @@
-from typing import Iterable
+from collections.abc import Iterable

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult

@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import Any, Generic, Iterable, Optional, Protocol, Type
+from collections.abc import Iterable
+from typing import Generic, Optional, Protocol, Type

 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar

@@ -1,12 +1,12 @@
 import copy
 import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional, Type
+from typing import List, Optional, Type

 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label

@@ -1,7 +1,8 @@
 import re
 from collections import Counter
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union

 import numpy as np
 from docling_core.types.doc import (

@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Union

 import numpy as np
 from docling_core.types.doc import (

@@ -1,8 +1,9 @@
 import logging
 import warnings
 import zipfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional, Type
+from typing import List, Optional, Type

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -98,8 +99,10 @@ class EasyOcrModel(BaseOcrModel):
         progress: bool = False,
     ) -> Path:
         # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
-        from easyocr.config import detection_models as det_models_dict
-        from easyocr.config import recognition_models as rec_models_dict
+        from easyocr.config import (
+            detection_models as det_models_dict,
+            recognition_models as rec_models_dict,
+        )

         if local_dir is None:
             local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder

@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (

 logger = logging.getLogger(__name__)


-@lru_cache()
+@lru_cache
 def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     factory = OcrFactory()
     factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     return factory


-@lru_cache()
+@lru_cache
 def get_picture_description_factory(
     allow_external_plugins: bool = False,
 ) -> PictureDescriptionFactory:
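
Note: `@lru_cache` without parentheses is ruff rule UP011. Since Python 3.8 the decorator can be applied directly when no arguments are given, and both forms cache identically:

    from functools import lru_cache

    @lru_cache  # UP011: no need for @lru_cache() when passing no arguments
    def square(n: int) -> int:
        return n * n

    square(3); square(3)
    assert square.cache_info().hits == 1  # second call was served from the cache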

@@ -1,18 +1,16 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)

@@ -1,16 +1,15 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
@@ -41,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
                 device = decide_device(accelerator_options.device)
                 self.device = device

-                _log.debug("Available device for HuggingFace VLM: {}".format(device))
+                _log.debug(f"Available device for HuggingFace VLM: {device}")

                 repo_cache_folder = vlm_options.repo_id.replace("/", "--")

@@ -1,8 +1,9 @@
 import copy
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional

 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor

@@ -1,8 +1,9 @@
 import logging
 import sys
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Tuple, Type
+from typing import Optional, Type

 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):

         if self.enabled:
             if "darwin" != sys.platform:
-                raise RuntimeError(f"OcrMac is only supported on Mac.")
+                raise RuntimeError("OcrMac is only supported on Mac.")

             install_errmsg = (
                 "ocrmac is not correctly installed. "
                 "Please install it via `pip install ocrmac` to use this OCR engine. "

@@ -1,6 +1,7 @@
 import logging
 import re
-from typing import Iterable, List
+from collections.abc import Iterable
+from typing import List

 from pydantic import BaseModel
@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
                    sanitized_text = "".join(lines)

                    # Text normalization
-                    sanitized_text = sanitized_text.replace("⁄", "/")
-                    sanitized_text = sanitized_text.replace("’", "'")
-                    sanitized_text = sanitized_text.replace("‘", "'")
+                    sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+                    sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+                    sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
                    sanitized_text = sanitized_text.replace("“", '"')
                    sanitized_text = sanitized_text.replace("”", '"')
                    sanitized_text = sanitized_text.replace("•", "·")

@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional
+from typing import Optional

 from PIL import ImageDraw
 from pydantic import BaseModel

@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union

 from PIL import Image

@@ -1,12 +1,11 @@
 import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable, List, Optional, Type, Union
+from typing import List, Optional, Type, Union

 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
-    PictureClassificationClass,
     PictureItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc

@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union

 from PIL import Image

@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -1,12 +1,7 @@
 import copy
-import random
-from pathlib import Path
 from typing import Dict, List

 from docling_core.types.doc import (
-    BoundingBox,
-    CoordOrigin,
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
     TableData,
 )
 from docling_core.types.doc.document import ContentLayer
-from docling_core.types.legacy_doc.base import Ref
-from docling_core.types.legacy_doc.document import BaseText
 from docling_ibm_models.reading_order.reading_order_rb import (
     PageElement as ReadingOrderPageElement,
     ReadingOrderPredictor,
 )
-from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
 from PIL import ImageDraw
 from pydantic import BaseModel, ConfigDict

 from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
     TextElement,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.settings import settings
 from docling.utils.profiling import ProfilingScope, TimeRecorder

@@ -1,13 +1,13 @@
 import copy
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional

 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_core.types.doc.page import (
     BoundingRectangle,
-    SegmentedPdfPage,
     TextCellUnit,
 )
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor

@@ -3,9 +3,10 @@ import io
 import logging
 import os
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type

 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell

@@ -3,9 +3,10 @@ import logging
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, List
+from collections.abc import Iterable
+from typing import Any, Callable, List

-from docling_core.types.doc import DoclingDocument, NodeItem
+from docling_core.types.doc import NodeItem

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
         total_elapsed_time = 0.0
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):

-            for i in range(0, conv_res.input.page_count):
+            for i in range(conv_res.input.page_count):
                 start_page, end_page = conv_res.input.limits.page_range
                 if (start_page - 1) <= i <= (end_page - 1):
                     conv_res.pages.append(Page(page_no=i))
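
Note: dropping the explicit 0 start is the `range(0, n)` to `range(n)` simplification applied throughout this commit (flake8-pie's PIE808 in ruff, if that is the rule enabled here): the single-argument form already starts at zero.

    n = 5
    assert list(range(0, n)) == list(range(n)) == [0, 1, 2, 3, 4]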

@@ -1,5 +1,4 @@
 import logging
-import sys
 import warnings
 from pathlib import Path
 from typing import Optional, cast

@@ -1,5 +1,4 @@
 import logging
-import warnings
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast

@@ -1,8 +1,8 @@
 import logging
-from typing import Any, Dict, Iterable, List, Tuple, Union
+from collections.abc import Iterable
+from typing import Any, Dict, List, Tuple, Union

 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table

 from docling.datamodel.document import ConversionResult, Page

@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
     return unique_objects


-def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
     origin = DocumentOrigin(
         mimetype="application/pdf",
         filename=doc_glm["file-info"]["filename"],

@@ -18,7 +18,7 @@ class UnionFind:

     def __init__(self, elements):
         self.parent = {elem: elem for elem in elements}
-        self.rank = {elem: 0 for elem in elements}
+        self.rank = dict.fromkeys(elements, 0)

     def find(self, x):
         if self.parent[x] != x:
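
Note: replacing `{elem: 0 for elem in elements}` with `dict.fromkeys(elements, 0)` (ruff's unnecessary-dict-comprehension check, C420 if that is the trigger) builds the same mapping; `fromkeys` is safe here because the shared value 0 is immutable.

    elements = ["a", "b", "c"]
    assert dict.fromkeys(elements, 0) == {e: 0 for e in elements}
    # Caution (general dict.fromkeys pitfall): a mutable default such as []
    # would be shared across all keys, unlike the comprehension form.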

@@ -37,7 +37,7 @@ def download_models(
     output_dir.mkdir(exist_ok=True, parents=True)

     if with_layout:
-        _log.info(f"Downloading layout model...")
+        _log.info("Downloading layout model...")
         LayoutModel.download_models(
             local_dir=output_dir / LayoutModel._model_repo_folder,
             force=force,
@@ -45,7 +45,7 @@ def download_models(
         )

     if with_tableformer:
-        _log.info(f"Downloading tableformer model...")
+        _log.info("Downloading tableformer model...")
         TableStructureModel.download_models(
             local_dir=output_dir / TableStructureModel._model_repo_folder,
             force=force,
@@ -53,7 +53,7 @@ def download_models(
         )

     if with_picture_classifier:
-        _log.info(f"Downloading picture classifier model...")
+        _log.info("Downloading picture classifier model...")
         DocumentPictureClassifier.download_models(
             local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
             force=force,
@@ -61,7 +61,7 @@ def download_models(
         )

     if with_code_formula:
-        _log.info(f"Downloading code formula model...")
+        _log.info("Downloading code formula model...")
         CodeFormulaModel.download_models(
             local_dir=output_dir / CodeFormulaModel._model_repo_folder,
             force=force,
@@ -69,7 +69,7 @@ def download_models(
         )

     if with_smolvlm:
-        _log.info(f"Downloading SmolVlm model...")
+        _log.info("Downloading SmolVlm model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=smolvlm_picture_description.repo_id,
             local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@@ -78,7 +78,7 @@ def download_models(
         )

     if with_granite_vision:
-        _log.info(f"Downloading Granite Vision model...")
+        _log.info("Downloading Granite Vision model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=granite_picture_description.repo_id,
             local_dir=output_dir / granite_picture_description.repo_cache_folder,
@@ -87,7 +87,7 @@ def download_models(
         )

     if with_easyocr:
-        _log.info(f"Downloading easyocr models...")
+        _log.info("Downloading easyocr models...")
         EasyOcrModel.download_models(
             local_dir=output_dir / EasyOcrModel._model_repo_folder,
             force=force,

@@ -383,7 +383,7 @@
     "\n",
     "print(f\"Downloading {url}...\")\n",
     "buf = BytesIO(requests.get(url).content)\n",
-    "print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
+    "print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
     "with zipfile.ZipFile(buf) as zf:\n",
     "    res = zf.testzip()\n",
     "    if res:\n",

@@ -1,8 +1,8 @@
 import json
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 import yaml
 from docling_core.types.doc import ImageRefMode
@@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption

 _log = logging.getLogger(__name__)

@@ -3,7 +3,6 @@ import logging
 import time
 from pathlib import Path

-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
@@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.models.ocr_mac_model import OcrMacOptions
-from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
-from docling.models.tesseract_ocr_model import TesseractOcrOptions

 _log = logging.getLogger(__name__)

@@ -3,8 +3,8 @@
 # It does not run the actual formula understanding model.

 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem

@@ -3,8 +3,9 @@
 # It does not run the actual picture classifier model.

 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable
+from typing import Any

 from docling_core.types.doc import (
     DoclingDocument,

@@ -4,7 +4,7 @@ from pathlib import Path

 from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -1,14 +1,9 @@
 from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    OcrMacOptions,
     PdfPipelineOptions,
-    RapidOcrOptions,
     TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -153,10 +153,10 @@
    "source": [
     "for i, chunk in enumerate(chunk_iter):\n",
     "    print(f\"=== {i} ===\")\n",
-    "    print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
+    "    print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
     "\n",
     "    enriched_text = chunker.serialize(chunk=chunk)\n",
-    "    print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
+    "    print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
     "\n",
     "    print()"
    ]
@@ -353,11 +353,11 @@
     "for i, chunk in enumerate(chunks):\n",
     "    print(f\"=== {i} ===\")\n",
     "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
-    "    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
+    "    print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
     "\n",
     "    ser_txt = chunker.serialize(chunk=chunk)\n",
     "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
-    "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
+    "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
     "\n",
     "    print()"
    ]

@@ -2,17 +2,11 @@ import json
 import time
 from pathlib import Path

-import yaml

-from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     VlmPipelineOptions,
-    granite_vision_vlm_conversion_options,
     smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
 )
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
 for source in sources:
     start_time = time.time()
     print("================================================")
-    print("Processing... {}".format(source))
+    print(f"Processing... {source}")
     print("================================================")
     print("")
@@ -77,7 +71,7 @@ for source in sources:
         print(page.predictions.vlm_response.text)

     res.document.save_as_html(
-        filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
+        filename=Path(f"{out_path}/{res.input.file.stem}.html"),
         image_mode=ImageRefMode.REFERENCED,
         labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
     )

@@ -144,7 +144,7 @@
     "for pic in doc.pictures[:5]:\n",
     "    html_item = (\n",
     "        f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
-    "        f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+    "        f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
     "        f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
     "    )\n",
     "    for annotation in pic.annotations:\n",
@@ -252,7 +252,7 @@
     "for pic in doc.pictures[:5]:\n",
     "    html_item = (\n",
     "        f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
-    "        f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+    "        f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
     "        f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
     "    )\n",
     "    for annotation in pic.annotations:\n",

@@ -351,7 +351,7 @@
     "for source in sources:\n",
     "    if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
     "        doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
-    "        print(f\"- text: {repr(doc_chunk.text)}\")\n",
+    "        print(f\"- text: {doc_chunk.text!r}\")\n",
     "        if doc_chunk.meta.origin:\n",
     "            print(f\"  file: {doc_chunk.meta.origin.filename}\")\n",
     "        if doc_chunk.meta.headings:\n",

@@ -119,7 +119,7 @@
     "    device = torch.device(\"mps\")\n",
     "    print(\"MPS GPU is enabled.\")\n",
     "else:\n",
-    "    raise EnvironmentError(\n",
+    "    raise OSError(\n",
     "        \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
     "    )"
    ]
@@ -226,7 +226,6 @@
    }
   ],
   "source": [
-    "from docling.datamodel.document import ConversionResult\n",
     "from docling.document_converter import DocumentConverter\n",
     "\n",
     "# Instantiate the doc converter\n",
@@ -345,7 +344,7 @@
     "\n",
     "    openai_api_key = os.getenv(openai_api_key_var)\n",
     "    if not openai_api_key:\n",
-    "        raise EnvironmentError(\n",
+    "        raise OSError(\n",
     "            f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
     "            \"Please define it before running this script.\"\n",
     "        )"
@@ -387,7 +386,6 @@
    "outputs": [],
    "source": [
     "import weaviate.classes.config as wc\n",
-    "from weaviate.classes.config import DataType, Property\n",
     "\n",
     "# Define the collection name\n",
     "collection_name = \"docling\"\n",

@@ -25,7 +25,7 @@ def main():
     document = mdb.convert()

     out_path = Path("scratch")
-    print(f"Document {path} converted.\nSaved markdown output to: {str(out_path)}")
+    print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")

     # Export Docling document format to markdowndoc:
     fn = os.path.basename(path)

@@ -1,13 +1,10 @@
 from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
     AcceleratorOptions,
     PdfPipelineOptions,
     TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -63,7 +63,7 @@ def main():
     out_path = Path("scratch")
     print(
         f"Document {res.input.file.name} converted."
-        f"\nSaved markdown output to: {str(out_path)}"
+        f"\nSaved markdown output to: {out_path!s}"
     )
     _log.debug(res.document._export_to_indented_text(max_text_len=16))

     # Export Docling document format to markdowndoc:

@@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
     TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -2,9 +2,9 @@ import logging
 import time
 from pathlib import Path

-from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
+from docling_core.types.doc import ImageRefMode, TableItem, TextItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
     ApiVlmOptions,
     ResponseFormat,
     VlmPipelineOptions,
-    granite_vision_vlm_ollama_conversion_options,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

@@ -202,12 +202,16 @@ select = [
 ]
 ignore = [
     "C408",  # Unnecessary `dict()` call (rewrite as a literal)
     "E501",  # Line too long, handled by ruff formatter
+    "D107",  # "Missing docstring in __init__",
     "F401",  # imported but unused; consider using `importlib.util.find_spec` to test for "
     "F811",  # "redefinition of the same function"
+    "PL",  # Pylint
+    "RUF012",  # Mutable Class Attributes
     "UP006",  # List vs list, etc
     "UP007",  # Option and Union
+    "UP035",  # `typing.Set` is deprecated, use `set` instead"
 ]

 #extend-select = []
@@ -217,7 +221,7 @@ ignore = [
 "tests/*.py" = ["ASYNC"]  # Disable ASYNC check for tests

 [tool.ruff.lint.mccabe]
-max-complexity = 15
+max-complexity = 20

@@ -37,7 +37,7 @@ def test_asciidocs_examples():
         print("\n\n", pred_mddoc)

         if os.path.exists(gname):
-            with open(gname, "r") as fr:
+            with open(gname) as fr:
                 true_mddoc = fr.read()

             # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"

@@ -1,5 +1,3 @@
-import json
-import os
 from pathlib import Path

 from pytest import warns
@@ -16,7 +14,7 @@ GENERATE = GEN_TEST_DATA

 def get_csv_paths():
     # Define the directory you want to search
-    directory = Path(f"./tests/data/csv/")
+    directory = Path("./tests/data/csv/")

     # List all CSV files in the directory and its subdirectories
     return sorted(directory.rglob("*.csv"))

@@ -32,7 +32,7 @@ def test_text_cell_counts():

     doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

@@ -31,7 +31,7 @@ def test_text_cell_counts():

     doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

@@ -31,7 +31,7 @@ def test_text_cell_counts():

     doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)

@@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA

 def get_pubmed_paths():
-    directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
+    directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
     xml_files = sorted(directory.rglob("*.xml"))
     return xml_files

@@ -1,4 +1,3 @@
-import os
 from pathlib import Path

 from docling.backend.msword_backend import MsWordDocumentBackend

@@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
     assert isinstance(texts[2], TextItem)
     assert texts[2].text == (
         "An interleaver receives incoming data frames of size N. The interleaver "
-        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
+        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "  # noqa: RUF001
         "then effectively rearranges (permutes) the data by permuting the rows of the "
-        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
+        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "  # noqa: RUF001
         "permute the columns (indexed by k) of each row (indexed by j). P is at least "
         "equal to N₂, βj is a constant which may be different for each row, and each "
-        "αj is a relative prime number relative to P. After permuting, the "
+        "αj is a relative prime number relative to P. After permuting, the "  # noqa: RUF001
         "interleaver outputs the data in a different order than received (e.g., "
         "receives sequentially row by row, outputs sequentially each column by column)."
     )

@@ -32,7 +32,7 @@ def test_text_cell_counts():

     doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

@@ -1,4 +1,3 @@
-import os
 from pathlib import Path

 from docling.datamodel.base_models import InputFormat

@@ -3,7 +3,6 @@ from pathlib import Path
 from docling_core.types.doc import CodeItem, TextItem
 from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions

@@ -2,7 +2,6 @@ from pathlib import Path

 from docling_core.types.doc import PictureClassificationData

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions

@@ -1,7 +1,6 @@
 from pathlib import Path

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
-from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions

@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import List

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
-from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (

@@ -4,7 +4,6 @@ from pathlib import Path
 import pytest

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
-from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -3,8 +3,6 @@ from pathlib import Path

 import pytest

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -216,7 +216,7 @@ def verify_picture_image_v2(

 def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
-    assert len(doc_pred.texts) == len(doc_true.texts), f"Text lengths do not match."
+    assert len(doc_pred.texts) == len(doc_true.texts), "Text lengths do not match."

     assert len(doc_true.tables) == len(doc_pred.tables), (
         "document has different count of tables than expected."
@@ -230,7 +230,7 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
         assert isinstance(pred_item, DocItem), "Test item is not a DocItem"

         # Validate type
-        assert true_item.label == pred_item.label, f"Object label does not match."
+        assert true_item.label == pred_item.label, "Object label does not match."

         # Validate provenance
         assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch"
@@ -337,16 +337,16 @@ def verify_conversion_result_v1(
         with open(dt_path, "w") as fw:
             fw.write(doc_pred_dt)
     else:  # default branch in test
-        with open(pages_path, "r") as fr:
+        with open(pages_path) as fr:
             doc_true_pages = PageList.validate_json(fr.read())

-        with open(json_path, "r") as fr:
+        with open(json_path) as fr:
             doc_true: DsDocument = DsDocument.model_validate_json(fr.read())

-        with open(md_path, "r") as fr:
+        with open(md_path) as fr:
             doc_true_md = fr.read()

-        with open(dt_path, "r") as fr:
+        with open(dt_path) as fr:
             doc_true_dt = fr.read()

         if not fuzzy:
@@ -419,16 +419,16 @@ def verify_conversion_result_v2(
         with open(dt_path, "w") as fw:
             fw.write(doc_pred_dt)
     else:  # default branch in test
-        with open(pages_path, "r") as fr:
+        with open(pages_path) as fr:
             doc_true_pages = PageList.validate_json(fr.read())

-        with open(json_path, "r") as fr:
+        with open(json_path) as fr:
             doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())

-        with open(md_path, "r") as fr:
+        with open(md_path) as fr:
             doc_true_md = fr.read()

-        with open(dt_path, "r") as fr:
+        with open(dt_path) as fr:
             doc_true_dt = fr.read()

         if not fuzzy: