apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-04-14 14:48:04 +02:00
parent d74e407526
commit 73cec158c6
84 changed files with 172 additions and 225 deletions

View File

@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
text_stream = self.path_or_stream.getvalue().decode("utf-8") text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.lines = text_stream.split("\n") self.lines = text_stream.split("\n")
if isinstance(self.path_or_stream, Path): if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f: with open(self.path_or_stream, encoding="utf-8") as f:
self.lines = f.readlines() self.lines = f.readlines()
self.valid = True self.valid = True
@ -75,7 +75,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return doc return doc
def _parse(self, doc: DoclingDocument): def _parse(self, doc: DoclingDocument): # noqa: C901
""" """
Main function that orchestrates the parsing by yielding components: Main function that orchestrates the parsing by yielding components:
title, section headers, text, lists, and tables. title, section headers, text, lists, and tables.
@ -95,7 +95,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# indents: dict[int, Union[DocItem, GroupItem, None]] = {} # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
indents: dict[int, Union[GroupItem, None]] = {} indents: dict[int, Union[GroupItem, None]] = {}
for i in range(0, 10): for i in range(10):
parents[i] = None parents[i] = None
indents[i] = None indents[i] = None

View File

@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
head = self.content.readline() head = self.content.readline()
dialect = csv.Sniffer().sniff(head, ",;\t|:") dialect = csv.Sniffer().sniff(head, ",;\t|:")
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"') _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
if not dialect.delimiter in {",", ";", "\t", "|", ":"}: if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
raise RuntimeError( raise RuntimeError(
f"Cannot convert csv with unknown delimiter {dialect.delimiter}." f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
) )

View File

@ -1,8 +1,9 @@
import logging import logging
import random import random
from collections.abc import Iterable
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional, Union from typing import List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size from docling_core.types.doc import BoundingBox, CoordOrigin, Size

View File

@ -1,8 +1,9 @@
import logging import logging
import random import random
from collections.abc import Iterable
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin

View File

@ -1,14 +1,14 @@
import logging import logging
import random from collections.abc import Iterable
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union from typing import TYPE_CHECKING, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import SegmentedPdfPage, TextCell from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
from PIL import Image, ImageDraw from PIL import Image
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend

View File

@ -1,12 +1,8 @@
# -*- coding: utf-8 -*-
""" """
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
On 23/01/2025 On 23/01/2025
""" """
from __future__ import unicode_literals
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~") CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
BLANK = "" BLANK = ""
@ -79,7 +75,6 @@ CHR_BO = {
} }
T = { T = {
"\u2192": "\\rightarrow ",
# Greek letters # Greek letters
"\U0001d6fc": "\\alpha ", "\U0001d6fc": "\\alpha ",
"\U0001d6fd": "\\beta ", "\U0001d6fd": "\\beta ",

View File

@ -76,7 +76,7 @@ def get_val(key, default=None, store=CHR):
return default return default
class Tag2Method(object): class Tag2Method:
def call_method(self, elm, stag=None): def call_method(self, elm, stag=None):
getmethod = self.tag2meth.get getmethod = self.tag2meth.get
if stag is None: if stag is None:
@ -157,7 +157,7 @@ class Pr(Tag2Method):
def do_common(self, elm): def do_common(self, elm):
stag = elm.tag.replace(OMML_NS, "") stag = elm.tag.replace(OMML_NS, "")
if stag in self.__val_tags: if stag in self.__val_tags:
t = elm.get("{0}val".format(OMML_NS)) t = elm.get(f"{OMML_NS}val")
self.__innerdict[stag] = t self.__innerdict[stag] = t
return None return None
@ -246,7 +246,6 @@ class oMath2Latex(Tag2Method):
""" """
the Pre-Sub-Superscript object -- Not support yet the Pre-Sub-Superscript object -- Not support yet
""" """
pass
def do_sub(self, elm): def do_sub(self, elm):
text = self.process_children(elm) text = self.process_children(elm)
@ -329,7 +328,7 @@ class oMath2Latex(Tag2Method):
t_dict = self.process_children_dict(elm, include=("e", "lim")) t_dict = self.process_children_dict(elm, include=("e", "lim"))
latex_s = LIM_FUNC.get(t_dict["e"]) latex_s = LIM_FUNC.get(t_dict["e"])
if not latex_s: if not latex_s:
raise NotSupport("Not support lim %s" % t_dict["e"]) raise RuntimeError("Not support lim %s" % t_dict["e"])
else: else:
return latex_s.format(lim=t_dict.get("lim")) return latex_s.format(lim=t_dict.get("lim"))
@ -411,7 +410,7 @@ class oMath2Latex(Tag2Method):
""" """
_str = [] _str = []
_base_str = [] _base_str = []
found_text = elm.findtext("./{0}t".format(OMML_NS)) found_text = elm.findtext(f"./{OMML_NS}t")
if found_text: if found_text:
for s in found_text: for s in found_text:
out_latex_str = self.process_unicode(s) out_latex_str = self.process_unicode(s)

View File

@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.max_levels = 10 self.max_levels = 10
self.level = 0 self.level = 0
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
for i in range(0, self.max_levels): for i in range(self.max_levels):
self.parents[i] = None self.parents[i] = None
try: try:
@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.analyze_tag(cast(Tag, element), doc) self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child: except Exception as exc_child:
_log.error( _log.error(
f"Error processing child from tag {tag.name}: {repr(exc_child)}" f"Error processing child from tag {tag.name}: {exc_child!r}"
) )
raise exc_child raise exc_child
elif isinstance(element, NavigableString) and not isinstance( elif isinstance(element, NavigableString) and not isinstance(
@ -357,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = "" marker = ""
enumerated = False enumerated = False
if parent_label == GroupLabel.ORDERED_LIST: if parent_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}." marker = f"{index_in_list!s}."
enumerated = True enumerated = True
doc.add_list_item( doc.add_list_item(
text=text, text=text,

View File

@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# otherwise they represent emphasis (bold or italic) # otherwise they represent emphasis (bold or italic)
self.markdown = self._shorten_underscore_sequences(text_stream) self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path): if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f: with open(self.path_or_stream, encoding="utf-8") as f:
md_content = f.read() md_content = f.read()
# remove invalid sequences # remove invalid sequences
# very long sequences of underscores will lead to unnecessary long processing times. # very long sequences of underscores will lead to unnecessary long processing times.
@ -235,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if has_non_empty_list_items: if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_item = doc.add_group( parent_item = doc.add_group(
label=label, name=f"list", parent=parent_item label=label, name="list", parent=parent_item
) )
elif ( elif (
@ -319,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._html_blocks += 1 self._html_blocks += 1
self._process_inline_text(parent_item, doc) self._process_inline_text(parent_item, doc)
self._close_table(doc) self._close_table(doc)
_log.debug("HTML Block: {}".format(element)) _log.debug(f"HTML Block: {element}")
if ( if (
len(element.body) > 0 len(element.body) > 0
): # If Marko doesn't return any content for HTML block, skip it ): # If Marko doesn't return any content for HTML block, skip it
@ -331,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else: else:
if not isinstance(element, str): if not isinstance(element, str):
self._close_table(doc) self._close_table(doc)
_log.debug("Some other element: {}".format(element)) _log.debug(f"Some other element: {element}")
processed_block_types = ( processed_block_types = (
marko.block.Heading, marko.block.Heading,

View File

@ -120,7 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
return prov return prov
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
is_a_list = False is_a_list = False
is_list_group_created = False is_list_group_created = False
enum_list_item_value = 0 enum_list_item_value = 0
@ -243,7 +243,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
enum_marker = str(enum_list_item_value) + "." enum_marker = str(enum_list_item_value) + "."
if not is_list_group_created: if not is_list_group_created:
new_list = doc.add_group( new_list = doc.add_group(
label=list_label, name=f"list", parent=parent_slide label=list_label, name="list", parent=parent_slide
) )
is_list_group_created = True is_list_group_created = True
doc.add_list_item( doc.add_list_item(
@ -372,7 +372,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
max_levels = 10 max_levels = 10
parents = {} # type: ignore parents = {} # type: ignore
for i in range(0, max_levels): for i in range(max_levels):
parents[i] = None parents[i] = None
# Loop through each slide # Loop through each slide

View File

@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}" f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
) )
if cell is None or cell._tc in cell_set: if cell is None or cell._tc in cell_set:
_log.debug(f" skipped since repeated content") _log.debug(" skipped since repeated content")
col_idx += cell.grid_span col_idx += cell.grid_span
continue continue
else: else:
@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image=ImageRef.from_pil(image=pil_image, dpi=72), image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None, caption=None,
) )
except (UnidentifiedImageError, OSError) as e: except (UnidentifiedImageError, OSError):
_log.warning("Warning: image cannot be loaded by Pillow") _log.warning("Warning: image cannot be loaded by Pillow")
doc.add_picture( doc.add_picture(
parent=self.parents[level - 1], parent=self.parents[level - 1],

View File

@ -1,7 +1,8 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections.abc import Iterable
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Set, Union from typing import Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.page import SegmentedPdfPage, TextCell from docling_core.types.doc.page import SegmentedPdfPage, TextCell

View File

@ -1,8 +1,9 @@
import logging import logging
import random import random
from collections.abc import Iterable
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
self.valid = True # No better way to tell from pypdfium. self.valid = True # No better way to tell from pypdfium.
try: try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no] self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
except PdfiumError as e: except PdfiumError:
_log.info( _log.info(
f"An exception occurred when loading page {page_no} of document {document_hash}.", f"An exception occurred when loading page {page_no} of document {document_hash}.",
exc_info=True, exc_info=True,

View File

@ -348,7 +348,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
return return
def _parse_element_citation(self, node: etree._Element) -> str: def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
citation: Citation = { citation: Citation = {
"author_names": "", "author_names": "",
"title": "", "title": "",
@ -439,7 +439,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip() citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
if len(node.xpath("lpage")) > 0: if len(node.xpath("lpage")) > 0:
citation["page"] += ( citation["page"] += (
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
) )
# Flatten the citation to string # Flatten the citation to string
@ -594,9 +594,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
try: try:
self._add_table(doc, parent, table) self._add_table(doc, parent, table)
except Exception as e: except Exception:
_log.warning(f"Skipping unsupported table in {str(self.file)}") _log.warning(f"Skipping unsupported table in {self.file!s}")
pass
return return

View File

@ -162,7 +162,6 @@ class PatentUspto(ABC):
Returns: Returns:
The patent parsed as a docling document. The patent parsed as a docling document.
""" """
pass
class PatentUsptoIce(PatentUspto): class PatentUsptoIce(PatentUspto):
@ -264,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
self.style_html = HtmlEntity() self.style_html = HtmlEntity()
@override @override
def startElement(self, tag, attributes): # noqa: N802 def startElement(self, tag, attributes):
"""Signal the start of an element. """Signal the start of an element.
Args: Args:
@ -280,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
self._start_registered_elements(tag, attributes) self._start_registered_elements(tag, attributes)
@override @override
def skippedEntity(self, name): # noqa: N802 def skippedEntity(self, name):
"""Receive notification of a skipped entity. """Receive notification of a skipped entity.
HTML entities will be skipped by the parser. This method will unescape them HTML entities will be skipped by the parser. This method will unescape them
@ -314,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
self.text += unescaped self.text += unescaped
@override @override
def endElement(self, tag): # noqa: N802 def endElement(self, tag):
"""Signal the end of an element. """Signal the end of an element.
Args: Args:
@ -602,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.style_html = HtmlEntity() self.style_html = HtmlEntity()
@override @override
def startElement(self, tag, attributes): # noqa: N802 def startElement(self, tag, attributes):
"""Signal the start of an element. """Signal the start of an element.
Args: Args:
@ -615,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
self._start_registered_elements(tag, attributes) self._start_registered_elements(tag, attributes)
@override @override
def skippedEntity(self, name): # noqa: N802 def skippedEntity(self, name):
"""Receive notification of a skipped entity. """Receive notification of a skipped entity.
HTML entities will be skipped by the parser. This method will unescape them HTML entities will be skipped by the parser. This method will unescape them
@ -649,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.text += unescaped self.text += unescaped
@override @override
def endElement(self, tag): # noqa: N802 def endElement(self, tag):
"""Signal the end of an element. """Signal the end of an element.
Args: Args:
@ -690,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
if tag in [member.value for member in self.Element]: if tag in [member.value for member in self.Element]:
if ( if (
tag == self.Element.HEADING.value tag == self.Element.HEADING.value
and not self.Element.SDOCL.value in self.property and self.Element.SDOCL.value not in self.property
): ):
level_attr: str = attributes.get("LVL", "") level_attr: str = attributes.get("LVL", "")
new_level: int = int(level_attr) if level_attr.isnumeric() else 1 new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@ -742,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
# headers except claims statement # headers except claims statement
elif ( elif (
self.Element.HEADING.value in self.property self.Element.HEADING.value in self.property
and not self.Element.SDOCL.value in self.property and self.Element.SDOCL.value not in self.property
and text.strip() and text.strip()
): ):
self.parents[self.level + 1] = self.doc.add_heading( self.parents[self.level + 1] = self.doc.add_heading(
@ -1163,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
self.style_html = HtmlEntity() self.style_html = HtmlEntity()
@override @override
def startElement(self, tag, attributes): # noqa: N802 def startElement(self, tag, attributes):
"""Signal the start of an element. """Signal the start of an element.
Args: Args:
@ -1176,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
self._start_registered_elements(tag, attributes) self._start_registered_elements(tag, attributes)
@override @override
def skippedEntity(self, name): # noqa: N802 def skippedEntity(self, name):
"""Receive notification of a skipped entity. """Receive notification of a skipped entity.
HTML entities will be skipped by the parser. This method will unescape them HTML entities will be skipped by the parser. This method will unescape them
@ -1210,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
self.text += unescaped self.text += unescaped
@override @override
def endElement(self, tag): # noqa: N802 def endElement(self, tag):
"""Signal the end of an element. """Signal the end of an element.
Args: Args:
@ -1526,7 +1525,7 @@ class XmlTable:
return ncols_max return ncols_max
def _parse_table(self, table: Tag) -> TableData: def _parse_table(self, table: Tag) -> TableData: # noqa: C901
"""Parse the content of a table tag. """Parse the content of a table tag.
Args: Args:
@ -1721,7 +1720,7 @@ class HtmlEntity:
"0": "&#8304;", "0": "&#8304;",
"+": "&#8314;", "+": "&#8314;",
"-": "&#8315;", "-": "&#8315;",
"−": "&#8315;", "−": "&#8315;", # noqa: RUF001
"=": "&#8316;", "=": "&#8316;",
"(": "&#8317;", "(": "&#8317;",
")": "&#8318;", ")": "&#8318;",
@ -1745,7 +1744,7 @@ class HtmlEntity:
"0": "&#8320;", "0": "&#8320;",
"+": "&#8330;", "+": "&#8330;",
"-": "&#8331;", "-": "&#8331;",
"−": "&#8331;", "−": "&#8331;", # noqa: RUF001
"=": "&#8332;", "=": "&#8332;",
"(": "&#8333;", "(": "&#8333;",
")": "&#8334;", ")": "&#8334;",

View File

@ -6,8 +6,9 @@ import sys
import tempfile import tempfile
import time import time
import warnings import warnings
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional, Type from typing import Annotated, Dict, List, Optional, Type
import rich.table import rich.table
import typer import typer
@ -288,7 +289,7 @@ def convert(
..., ...,
help=( help=(
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: " f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
f"{', '.join((o.value for o in ocr_engines_enum_internal))}. " f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
f"Use the option --show-external-plugins to see the options allowed with external plugins." f"Use the option --show-external-plugins to see the options allowed with external plugins."
), ),
), ),

View File

@ -62,7 +62,7 @@ def download(
models: Annotated[ models: Annotated[
Optional[list[_AvailableModels]], Optional[list[_AvailableModels]],
typer.Argument( typer.Argument(
help=f"Models to download (default behavior: a predefined set of models will be downloaded).", help="Models to download (default behavior: a predefined set of models will be downloaded).",
), ),
] = None, ] = None,
all: Annotated[ all: Annotated[

View File

@ -10,7 +10,7 @@ from docling_core.types.doc import (
TableCell, TableCell,
) )
from docling_core.types.doc.page import SegmentedPdfPage, TextCell from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location from docling_core.types.io import (
DocumentStream, DocumentStream,
) )
from PIL.Image import Image from PIL.Image import Image
@ -243,7 +243,7 @@ class Page(BaseModel):
if self._backend is None: if self._backend is None:
return self._image_cache.get(scale, None) return self._image_cache.get(scale, None)
if not scale in self._image_cache: if scale not in self._image_cache:
if cropbox is None: if cropbox is None:
self._image_cache[scale] = self._backend.get_page_image(scale=scale) self._image_cache[scale] = self._backend.get_page_image(scale=scale)
else: else:

View File

@ -1,13 +1,13 @@
import csv import csv
import logging import logging
import re import re
from collections.abc import Iterable
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import ( from typing import (
TYPE_CHECKING, TYPE_CHECKING,
Dict, Dict,
Iterable,
List, List,
Literal, Literal,
Optional, Optional,
@ -18,31 +18,9 @@ from typing import (
import filetype import filetype
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
PictureItem,
SectionHeaderItem,
TableItem,
TextItem,
) )
from docling_core.types.doc.document import ListItem
from docling_core.types.legacy_doc.base import (
BaseText,
Figure,
GlmTableCell,
PageDimensions,
PageReference,
Prov,
Ref,
)
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
from docling_core.types.legacy_doc.base import TableCell
from docling_core.types.legacy_doc.document import (
CCSDocumentDescription as DsDocumentDescription,
)
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_source_to_stream from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel from pydantic import BaseModel
@ -65,7 +43,7 @@ from docling.datamodel.base_models import (
) )
from docling.datamodel.settings import DocumentLimits from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem from docling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash, create_hash from docling.utils.utils import create_file_hash
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.document_converter import FormatOption from docling.document_converter import FormatOption

View File

@ -1,11 +1,11 @@
import hashlib import hashlib
import logging import logging
import math
import sys import sys
import time import time
from collections.abc import Iterable, Iterator
from functools import partial from functools import partial
from pathlib import Path from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union from typing import Dict, List, Optional, Tuple, Type, Union
from pydantic import BaseModel, ConfigDict, model_validator, validate_call from pydantic import BaseModel, ConfigDict, model_validator, validate_call
@ -254,7 +254,7 @@ class DocumentConverter:
if not had_result and raises_on_error: if not had_result and raises_on_error:
raise ConversionError( raise ConversionError(
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats." "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
) )
def _convert( def _convert(
@ -266,7 +266,7 @@ class DocumentConverter:
conv_input.docs(self.format_to_options), conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size, # pass format_options settings.perf.doc_batch_size, # pass format_options
): ):
_log.info(f"Going to convert document batch...") _log.info("Going to convert document batch...")
# parallel processing only within input_batch # parallel processing only within input_batch
# with ThreadPoolExecutor( # with ThreadPoolExecutor(

View File

@ -1,4 +1,4 @@
from typing import Iterable from collections.abc import Iterable
from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult

View File

@ -1,5 +1,6 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Generic, Iterable, Optional, Protocol, Type from collections.abc import Iterable
from typing import Generic, Optional, Protocol, Type
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from typing_extensions import TypeVar from typing_extensions import TypeVar

View File

@ -1,12 +1,12 @@
import copy import copy
import logging import logging
from abc import abstractmethod from abc import abstractmethod
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional, Type from typing import List, Optional, Type
import numpy as np import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from rtree import index from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label from scipy.ndimage import binary_dilation, find_objects, label

View File

@ -1,7 +1,8 @@
import re import re
from collections import Counter from collections import Counter
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
from docling_core.types.doc import ( from docling_core.types.doc import (

View File

@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union from typing import List, Literal, Optional, Union
import numpy as np import numpy as np
from docling_core.types.doc import ( from docling_core.types.doc import (

View File

@ -1,8 +1,9 @@
import logging import logging
import warnings import warnings
import zipfile import zipfile
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional, Type from typing import List, Optional, Type
import numpy import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -98,8 +99,10 @@ class EasyOcrModel(BaseOcrModel):
progress: bool = False, progress: bool = False,
) -> Path: ) -> Path:
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
from easyocr.config import detection_models as det_models_dict from easyocr.config import (
from easyocr.config import recognition_models as rec_models_dict detection_models as det_models_dict,
recognition_models as rec_models_dict,
)
if local_dir is None: if local_dir is None:
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder

View File

@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@lru_cache() @lru_cache
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory: def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
factory = OcrFactory() factory = OcrFactory()
factory.load_from_plugins(allow_external_plugins=allow_external_plugins) factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
return factory return factory
@lru_cache() @lru_cache
def get_picture_description_factory( def get_picture_description_factory(
allow_external_plugins: bool = False, allow_external_plugins: bool = False,
) -> PictureDescriptionFactory: ) -> PictureDescriptionFactory:

View File

@ -1,18 +1,16 @@
import logging import logging
import time import time
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
HuggingFaceVlmOptions, HuggingFaceVlmOptions,
) )
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -1,16 +1,15 @@
import logging import logging
import time import time
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
HuggingFaceVlmOptions, HuggingFaceVlmOptions,
) )
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
@ -41,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
device = decide_device(accelerator_options.device) device = decide_device(accelerator_options.device)
self.device = device self.device = device
_log.debug("Available device for HuggingFace VLM: {}".format(device)) _log.debug(f"Available device for HuggingFace VLM: {device}")
repo_cache_folder = vlm_options.repo_id.replace("/", "--") repo_cache_folder = vlm_options.repo_id.replace("/", "--")

View File

@ -1,8 +1,9 @@
import copy import copy
import logging import logging
import warnings import warnings
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Union from typing import Optional
from docling_core.types.doc import DocItemLabel from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor

View File

@ -1,8 +1,9 @@
import logging import logging
import sys import sys
import tempfile import tempfile
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Tuple, Type from typing import Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell from docling_core.types.doc.page import BoundingRectangle, TextCell
@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
if self.enabled: if self.enabled:
if "darwin" != sys.platform: if "darwin" != sys.platform:
raise RuntimeError(f"OcrMac is only supported on Mac.") raise RuntimeError("OcrMac is only supported on Mac.")
install_errmsg = ( install_errmsg = (
"ocrmac is not correctly installed. " "ocrmac is not correctly installed. "
"Please install it via `pip install ocrmac` to use this OCR engine. " "Please install it via `pip install ocrmac` to use this OCR engine. "

View File

@ -1,6 +1,7 @@
import logging import logging
import re import re
from typing import Iterable, List from collections.abc import Iterable
from typing import List
from pydantic import BaseModel from pydantic import BaseModel
@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
sanitized_text = "".join(lines) sanitized_text = "".join(lines)
# Text normalization # Text normalization
sanitized_text = sanitized_text.replace("", "/") sanitized_text = sanitized_text.replace("", "/") # noqa: RUF001
sanitized_text = sanitized_text.replace("", "'") sanitized_text = sanitized_text.replace("", "'") # noqa: RUF001
sanitized_text = sanitized_text.replace("", "'") sanitized_text = sanitized_text.replace("", "'") # noqa: RUF001
sanitized_text = sanitized_text.replace("", '"') sanitized_text = sanitized_text.replace("", '"')
sanitized_text = sanitized_text.replace("", '"') sanitized_text = sanitized_text.replace("", '"')
sanitized_text = sanitized_text.replace("", "·") sanitized_text = sanitized_text.replace("", "·")

View File

@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional from typing import Optional
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import BaseModel from pydantic import BaseModel

View File

@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type, Union from typing import Optional, Type, Union
from PIL import Image from PIL import Image

View File

@ -1,12 +1,11 @@
import logging
from abc import abstractmethod from abc import abstractmethod
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Any, Iterable, List, Optional, Type, Union from typing import List, Optional, Type, Union
from docling_core.types.doc import ( from docling_core.types.doc import (
DoclingDocument, DoclingDocument,
NodeItem, NodeItem,
PictureClassificationClass,
PictureItem, PictureItem,
) )
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc

View File

@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type, Union from typing import Optional, Type, Union
from PIL import Image from PIL import Image

View File

@ -1,6 +1,7 @@
import logging import logging
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type from typing import Optional, Type
import numpy import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin

View File

@ -1,12 +1,7 @@
import copy
import random
from pathlib import Path from pathlib import Path
from typing import Dict, List from typing import Dict, List
from docling_core.types.doc import ( from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
@ -17,13 +12,10 @@ from docling_core.types.doc import (
TableData, TableData,
) )
from docling_core.types.doc.document import ContentLayer from docling_core.types.doc.document import ContentLayer
from docling_core.types.legacy_doc.base import Ref
from docling_core.types.legacy_doc.document import BaseText
from docling_ibm_models.reading_order.reading_order_rb import ( from docling_ibm_models.reading_order.reading_order_rb import (
PageElement as ReadingOrderPageElement, PageElement as ReadingOrderPageElement,
ReadingOrderPredictor,
) )
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
TextElement, TextElement,
) )
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder

View File

@ -1,13 +1,13 @@
import copy import copy
import warnings import warnings
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Union from typing import Optional
import numpy import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_core.types.doc.page import ( from docling_core.types.doc.page import (
BoundingRectangle, BoundingRectangle,
SegmentedPdfPage,
TextCellUnit, TextCellUnit,
) )
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor

View File

@ -3,9 +3,10 @@ import io
import logging import logging
import os import os
import tempfile import tempfile
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
import pandas as pd import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin

View File

@ -1,6 +1,7 @@
import logging import logging
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type from typing import Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell from docling_core.types.doc.page import BoundingRectangle, TextCell

View File

@ -3,9 +3,10 @@ import logging
import time import time
import traceback import traceback
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Callable, Iterable, List from collections.abc import Iterable
from typing import Any, Callable, List
from docling_core.types.doc import DoclingDocument, NodeItem from docling_core.types.doc import NodeItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
total_elapsed_time = 0.0 total_elapsed_time = 0.0
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
for i in range(0, conv_res.input.page_count): for i in range(conv_res.input.page_count):
start_page, end_page = conv_res.input.limits.page_range start_page, end_page = conv_res.input.limits.page_range
if (start_page - 1) <= i <= (end_page - 1): if (start_page - 1) <= i <= (end_page - 1):
conv_res.pages.append(Page(page_no=i)) conv_res.pages.append(Page(page_no=i))

View File

@ -1,5 +1,4 @@
import logging import logging
import sys
import warnings import warnings
from pathlib import Path from pathlib import Path
from typing import Optional, cast from typing import Optional, cast

View File

@ -1,5 +1,4 @@
import logging import logging
import warnings
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import List, Optional, Union, cast from typing import List, Optional, Union, cast

View File

@ -1,8 +1,8 @@
import logging import logging
from typing import Any, Dict, Iterable, List, Tuple, Union from collections.abc import Iterable
from typing import Any, Dict, List, Tuple, Union
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.document import ConversionResult, Page from docling.datamodel.document import ConversionResult, Page

View File

@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
return unique_objects return unique_objects
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
origin = DocumentOrigin( origin = DocumentOrigin(
mimetype="application/pdf", mimetype="application/pdf",
filename=doc_glm["file-info"]["filename"], filename=doc_glm["file-info"]["filename"],

View File

@ -18,7 +18,7 @@ class UnionFind:
def __init__(self, elements): def __init__(self, elements):
self.parent = {elem: elem for elem in elements} self.parent = {elem: elem for elem in elements}
self.rank = {elem: 0 for elem in elements} self.rank = dict.fromkeys(elements, 0)
def find(self, x): def find(self, x):
if self.parent[x] != x: if self.parent[x] != x:

View File

@ -37,7 +37,7 @@ def download_models(
output_dir.mkdir(exist_ok=True, parents=True) output_dir.mkdir(exist_ok=True, parents=True)
if with_layout: if with_layout:
_log.info(f"Downloading layout model...") _log.info("Downloading layout model...")
LayoutModel.download_models( LayoutModel.download_models(
local_dir=output_dir / LayoutModel._model_repo_folder, local_dir=output_dir / LayoutModel._model_repo_folder,
force=force, force=force,
@ -45,7 +45,7 @@ def download_models(
) )
if with_tableformer: if with_tableformer:
_log.info(f"Downloading tableformer model...") _log.info("Downloading tableformer model...")
TableStructureModel.download_models( TableStructureModel.download_models(
local_dir=output_dir / TableStructureModel._model_repo_folder, local_dir=output_dir / TableStructureModel._model_repo_folder,
force=force, force=force,
@ -53,7 +53,7 @@ def download_models(
) )
if with_picture_classifier: if with_picture_classifier:
_log.info(f"Downloading picture classifier model...") _log.info("Downloading picture classifier model...")
DocumentPictureClassifier.download_models( DocumentPictureClassifier.download_models(
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder, local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
force=force, force=force,
@ -61,7 +61,7 @@ def download_models(
) )
if with_code_formula: if with_code_formula:
_log.info(f"Downloading code formula model...") _log.info("Downloading code formula model...")
CodeFormulaModel.download_models( CodeFormulaModel.download_models(
local_dir=output_dir / CodeFormulaModel._model_repo_folder, local_dir=output_dir / CodeFormulaModel._model_repo_folder,
force=force, force=force,
@ -69,7 +69,7 @@ def download_models(
) )
if with_smolvlm: if with_smolvlm:
_log.info(f"Downloading SmolVlm model...") _log.info("Downloading SmolVlm model...")
PictureDescriptionVlmModel.download_models( PictureDescriptionVlmModel.download_models(
repo_id=smolvlm_picture_description.repo_id, repo_id=smolvlm_picture_description.repo_id,
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder, local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@ -78,7 +78,7 @@ def download_models(
) )
if with_granite_vision: if with_granite_vision:
_log.info(f"Downloading Granite Vision model...") _log.info("Downloading Granite Vision model...")
PictureDescriptionVlmModel.download_models( PictureDescriptionVlmModel.download_models(
repo_id=granite_picture_description.repo_id, repo_id=granite_picture_description.repo_id,
local_dir=output_dir / granite_picture_description.repo_cache_folder, local_dir=output_dir / granite_picture_description.repo_cache_folder,
@ -87,7 +87,7 @@ def download_models(
) )
if with_easyocr: if with_easyocr:
_log.info(f"Downloading easyocr models...") _log.info("Downloading easyocr models...")
EasyOcrModel.download_models( EasyOcrModel.download_models(
local_dir=output_dir / EasyOcrModel._model_repo_folder, local_dir=output_dir / EasyOcrModel._model_repo_folder,
force=force, force=force,

View File

@ -383,7 +383,7 @@
"\n", "\n",
"print(f\"Downloading {url}...\")\n", "print(f\"Downloading {url}...\")\n",
"buf = BytesIO(requests.get(url).content)\n", "buf = BytesIO(requests.get(url).content)\n",
"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n", "print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
"with zipfile.ZipFile(buf) as zf:\n", "with zipfile.ZipFile(buf) as zf:\n",
" res = zf.testzip()\n", " res = zf.testzip()\n",
" if res:\n", " if res:\n",

View File

@ -1,8 +1,8 @@
import json import json
import logging import logging
import time import time
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable
import yaml import yaml
from docling_core.types.doc import ImageRefMode from docling_core.types.doc import ImageRefMode
@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -3,7 +3,6 @@ import logging
import time import time
from pathlib import Path from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
PdfPipelineOptions, PdfPipelineOptions,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -3,8 +3,8 @@
# It does not run the actual formula understanding model. # It does not run the actual formula understanding model.
import logging import logging
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem

View File

@ -3,8 +3,9 @@
# It does not run the actual picture classifier model. # It does not run the actual picture classifier model.
import logging import logging
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Any, Iterable from typing import Any
from docling_core.types.doc import ( from docling_core.types.doc import (
DoclingDocument, DoclingDocument,

View File

@ -4,7 +4,7 @@ from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -1,14 +1,9 @@
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions, PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -153,10 +153,10 @@
"source": [ "source": [
"for i, chunk in enumerate(chunk_iter):\n", "for i, chunk in enumerate(chunk_iter):\n",
" print(f\"=== {i} ===\")\n", " print(f\"=== {i} ===\")\n",
" print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n", " print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
"\n", "\n",
" enriched_text = chunker.serialize(chunk=chunk)\n", " enriched_text = chunker.serialize(chunk=chunk)\n",
" print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n", " print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
"\n", "\n",
" print()" " print()"
] ]
@ -353,11 +353,11 @@
"for i, chunk in enumerate(chunks):\n", "for i, chunk in enumerate(chunks):\n",
" print(f\"=== {i} ===\")\n", " print(f\"=== {i} ===\")\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n", " txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n", " print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
"\n", "\n",
" ser_txt = chunker.serialize(chunk=chunk)\n", " ser_txt = chunker.serialize(chunk=chunk)\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n", " ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n", " print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
"\n", "\n",
" print()" " print()"
] ]

View File

@ -2,17 +2,11 @@ import json
import time import time
from pathlib import Path from pathlib import Path
import yaml
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice,
VlmPipelineOptions, VlmPipelineOptions,
granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options, smoldocling_vlm_mlx_conversion_options,
) )
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline from docling.pipeline.vlm_pipeline import VlmPipeline
@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
for source in sources: for source in sources:
start_time = time.time() start_time = time.time()
print("================================================") print("================================================")
print("Processing... {}".format(source)) print(f"Processing... {source}")
print("================================================") print("================================================")
print("") print("")
@ -77,7 +71,7 @@ for source in sources:
print(page.predictions.vlm_response.text) print(page.predictions.vlm_response.text)
res.document.save_as_html( res.document.save_as_html(
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)), filename=Path(f"{out_path}/{res.input.file.stem}.html"),
image_mode=ImageRefMode.REFERENCED, image_mode=ImageRefMode.REFERENCED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE], labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
) )

View File

@ -144,7 +144,7 @@
"for pic in doc.pictures[:5]:\n", "for pic in doc.pictures[:5]:\n",
" html_item = (\n", " html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n", " f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n", " f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n", " f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n", " )\n",
" for annotation in pic.annotations:\n", " for annotation in pic.annotations:\n",
@ -252,7 +252,7 @@
"for pic in doc.pictures[:5]:\n", "for pic in doc.pictures[:5]:\n",
" html_item = (\n", " html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n", " f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n", " f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n", " f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n", " )\n",
" for annotation in pic.annotations:\n", " for annotation in pic.annotations:\n",

View File

@ -351,7 +351,7 @@
"for source in sources:\n", "for source in sources:\n",
" if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n", " if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
" doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n", " doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
" print(f\"- text: {repr(doc_chunk.text)}\")\n", " print(f\"- text: {doc_chunk.text!r}\")\n",
" if doc_chunk.meta.origin:\n", " if doc_chunk.meta.origin:\n",
" print(f\" file: {doc_chunk.meta.origin.filename}\")\n", " print(f\" file: {doc_chunk.meta.origin.filename}\")\n",
" if doc_chunk.meta.headings:\n", " if doc_chunk.meta.headings:\n",

View File

@ -119,7 +119,7 @@
" device = torch.device(\"mps\")\n", " device = torch.device(\"mps\")\n",
" print(\"MPS GPU is enabled.\")\n", " print(\"MPS GPU is enabled.\")\n",
"else:\n", "else:\n",
" raise EnvironmentError(\n", " raise OSError(\n",
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n", " \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
" )" " )"
] ]
@ -226,7 +226,6 @@
} }
], ],
"source": [ "source": [
"from docling.datamodel.document import ConversionResult\n",
"from docling.document_converter import DocumentConverter\n", "from docling.document_converter import DocumentConverter\n",
"\n", "\n",
"# Instantiate the doc converter\n", "# Instantiate the doc converter\n",
@ -345,7 +344,7 @@
"\n", "\n",
" openai_api_key = os.getenv(openai_api_key_var)\n", " openai_api_key = os.getenv(openai_api_key_var)\n",
" if not openai_api_key:\n", " if not openai_api_key:\n",
" raise EnvironmentError(\n", " raise OSError(\n",
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n", " f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
" \"Please define it before running this script.\"\n", " \"Please define it before running this script.\"\n",
" )" " )"
@ -387,7 +386,6 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"import weaviate.classes.config as wc\n", "import weaviate.classes.config as wc\n",
"from weaviate.classes.config import DataType, Property\n",
"\n", "\n",
"# Define the collection name\n", "# Define the collection name\n",
"collection_name = \"docling\"\n", "collection_name = \"docling\"\n",

View File

@ -25,7 +25,7 @@ def main():
document = mdb.convert() document = mdb.convert()
out_path = Path("scratch") out_path = Path("scratch")
print(f"Document {path} converted.\nSaved markdown output to: {str(out_path)}") print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")
# Export Docling document format to markdowndoc: # Export Docling document format to markdowndoc:
fn = os.path.basename(path) fn = os.path.basename(path)

View File

@ -1,13 +1,10 @@
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
PdfPipelineOptions, PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
) )
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -63,7 +63,7 @@ def main():
out_path = Path("scratch") out_path = Path("scratch")
print( print(
f"Document {res.input.file.name} converted." f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {str(out_path)}" f"\nSaved markdown output to: {out_path!s}"
) )
_log.debug(res.document._export_to_indented_text(max_text_len=16)) _log.debug(res.document._export_to_indented_text(max_text_len=16))
# Export Docling document format to markdowndoc: # Export Docling document format to markdowndoc:

View File

@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
PdfPipelineOptions, PdfPipelineOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -2,9 +2,9 @@ import logging
import time import time
from pathlib import Path from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem from docling_core.types.doc import ImageRefMode, TableItem, TextItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
ApiVlmOptions, ApiVlmOptions,
ResponseFormat, ResponseFormat,
VlmPipelineOptions, VlmPipelineOptions,
granite_vision_vlm_ollama_conversion_options,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline from docling.pipeline.vlm_pipeline import VlmPipeline

View File

@ -202,12 +202,16 @@ select = [
] ]
ignore = [ ignore = [
"C408", # Unnecessary `dict()` call (rewrite as a literal)
"E501", # Line too long, handled by ruff formatter "E501", # Line too long, handled by ruff formatter
"D107", # "Missing docstring in __init__", "D107", # "Missing docstring in __init__",
"F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
"F811", # "redefinition of the same function" "F811", # "redefinition of the same function"
"PL", # Pylint "PL", # Pylint
"RUF012", # Mutable Class Attributes "RUF012", # Mutable Class Attributes
"UP006", # List vs list, etc
"UP007", # Option and Union "UP007", # Option and Union
"UP035", # `typing.Set` is deprecated, use `set` instead"
] ]
#extend-select = [] #extend-select = []
@ -217,7 +221,7 @@ ignore = [
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests "tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
[tool.ruff.lint.mccabe] [tool.ruff.lint.mccabe]
max-complexity = 15 max-complexity = 20
# [tool.ruff.lint.isort.sections] # [tool.ruff.lint.isort.sections]
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"] # "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]

View File

@ -37,7 +37,7 @@ def test_asciidocs_examples():
print("\n\n", pred_mddoc) print("\n\n", pred_mddoc)
if os.path.exists(gname): if os.path.exists(gname):
with open(gname, "r") as fr: with open(gname) as fr:
true_mddoc = fr.read() true_mddoc = fr.read()
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"

View File

@ -1,5 +1,3 @@
import json
import os
from pathlib import Path from pathlib import Path
from pytest import warns from pytest import warns
@ -16,7 +14,7 @@ GENERATE = GEN_TEST_DATA
def get_csv_paths(): def get_csv_paths():
# Define the directory you want to search # Define the directory you want to search
directory = Path(f"./tests/data/csv/") directory = Path("./tests/data/csv/")
# List all CSV files in the directory and its subdirectories # List all CSV files in the directory and its subdirectories
return sorted(directory.rglob("*.csv")) return sorted(directory.rglob("*.csv"))

View File

@ -32,7 +32,7 @@ def test_text_cell_counts():
doc_backend = _get_backend(pdf_doc) doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()): for page_index in range(doc_backend.page_count()):
last_cell_count = None last_cell_count = None
for i in range(10): for i in range(10):
page_backend: DoclingParsePageBackend = doc_backend.load_page(0) page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

View File

@ -31,7 +31,7 @@ def test_text_cell_counts():
doc_backend = _get_backend(pdf_doc) doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()): for page_index in range(doc_backend.page_count()):
last_cell_count = None last_cell_count = None
for i in range(10): for i in range(10):
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0) page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

View File

@ -31,7 +31,7 @@ def test_text_cell_counts():
doc_backend = _get_backend(pdf_doc) doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()): for page_index in range(doc_backend.page_count()):
last_cell_count = None last_cell_count = None
for i in range(10): for i in range(10):
page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0) page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)

View File

@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA
def get_pubmed_paths(): def get_pubmed_paths():
directory = Path(os.path.dirname(__file__) + f"/data/pubmed/") directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
xml_files = sorted(directory.rglob("*.xml")) xml_files = sorted(directory.rglob("*.xml"))
return xml_files return xml_files

View File

@ -1,4 +1,3 @@
import os
from pathlib import Path from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend

View File

@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
assert isinstance(texts[2], TextItem) assert isinstance(texts[2], TextItem)
assert texts[2].text == ( assert texts[2].text == (
"An interleaver receives incoming data frames of size N. The interleaver " "An interleaver receives incoming data frames of size N. The interleaver "
"indexes the elements of the frame with an N₁×N₂ index array. The interleaver " "indexes the elements of the frame with an N₁×N₂ index array. The interleaver " # noqa: RUF001
"then effectively rearranges (permutes) the data by permuting the rows of the " "then effectively rearranges (permutes) the data by permuting the rows of the "
"index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to " "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to " # noqa: RUF001
"permute the columns (indexed by k) of each row (indexed by j). P is at least " "permute the columns (indexed by k) of each row (indexed by j). P is at least "
"equal to N₂, βj is a constant which may be different for each row, and each " "equal to N₂, βj is a constant which may be different for each row, and each "
"αj is a relative prime number relative to P. After permuting, the " "αj is a relative prime number relative to P. After permuting, the " # noqa: RUF001
"interleaver outputs the data in a different order than received (e.g., " "interleaver outputs the data in a different order than received (e.g., "
"receives sequentially row by row, outputs sequentially each column by column)." "receives sequentially row by row, outputs sequentially each column by column)."
) )

View File

@ -32,7 +32,7 @@ def test_text_cell_counts():
doc_backend = _get_backend(pdf_doc) doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()): for page_index in range(doc_backend.page_count()):
last_cell_count = None last_cell_count = None
for i in range(10): for i in range(10):
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

View File

@ -1,4 +1,3 @@
import os
from pathlib import Path from pathlib import Path
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat

View File

@ -3,7 +3,6 @@ from pathlib import Path
from docling_core.types.doc import CodeItem, TextItem from docling_core.types.doc import CodeItem, TextItem
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions

View File

@ -2,7 +2,6 @@ from pathlib import Path
from docling_core.types.doc import PictureClassificationData from docling_core.types.doc import PictureClassificationData
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions

View File

@ -1,7 +1,6 @@
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions

View File

@ -3,7 +3,6 @@ from pathlib import Path
from typing import List from typing import List
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (

View File

@ -4,7 +4,6 @@ from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -3,8 +3,6 @@ from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -216,7 +216,7 @@ def verify_picture_image_v2(
def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool): def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
assert len(doc_pred.texts) == len(doc_true.texts), f"Text lengths do not match." assert len(doc_pred.texts) == len(doc_true.texts), "Text lengths do not match."
assert len(doc_true.tables) == len(doc_pred.tables), ( assert len(doc_true.tables) == len(doc_pred.tables), (
"document has different count of tables than expected." "document has different count of tables than expected."
@ -230,7 +230,7 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
assert isinstance(pred_item, DocItem), "Test item is not a DocItem" assert isinstance(pred_item, DocItem), "Test item is not a DocItem"
# Validate type # Validate type
assert true_item.label == pred_item.label, f"Object label does not match." assert true_item.label == pred_item.label, "Object label does not match."
# Validate provenance # Validate provenance
assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch" assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch"
@ -337,16 +337,16 @@ def verify_conversion_result_v1(
with open(dt_path, "w") as fw: with open(dt_path, "w") as fw:
fw.write(doc_pred_dt) fw.write(doc_pred_dt)
else: # default branch in test else: # default branch in test
with open(pages_path, "r") as fr: with open(pages_path) as fr:
doc_true_pages = PageList.validate_json(fr.read()) doc_true_pages = PageList.validate_json(fr.read())
with open(json_path, "r") as fr: with open(json_path) as fr:
doc_true: DsDocument = DsDocument.model_validate_json(fr.read()) doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
with open(md_path, "r") as fr: with open(md_path) as fr:
doc_true_md = fr.read() doc_true_md = fr.read()
with open(dt_path, "r") as fr: with open(dt_path) as fr:
doc_true_dt = fr.read() doc_true_dt = fr.read()
if not fuzzy: if not fuzzy:
@ -419,16 +419,16 @@ def verify_conversion_result_v2(
with open(dt_path, "w") as fw: with open(dt_path, "w") as fw:
fw.write(doc_pred_dt) fw.write(doc_pred_dt)
else: # default branch in test else: # default branch in test
with open(pages_path, "r") as fr: with open(pages_path) as fr:
doc_true_pages = PageList.validate_json(fr.read()) doc_true_pages = PageList.validate_json(fr.read())
with open(json_path, "r") as fr: with open(json_path) as fr:
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read()) doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
with open(md_path, "r") as fr: with open(md_path) as fr:
doc_true_md = fr.read() doc_true_md = fr.read()
with open(dt_path, "r") as fr: with open(dt_path) as fr:
doc_true_dt = fr.read() doc_true_dt = fr.read()
if not fuzzy: if not fuzzy: