Mirror of https://github.com/DS4SD/docling.git, synced 2025-07-27 04:24:45 +00:00

commit 73cec158c6 (parent d74e407526)

    apply ruff lint fixes

    Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
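The diff below is long but mechanical: nearly every hunk applies one of a small set of automatic ruff fixes. As a hedged summary, here is a condensed sketch of the recurring before/after patterns (the function and class names are invented for illustration and do not appear in docling):

    # Condensed sketch of the lint-fix patterns applied throughout this commit.
    from collections.abc import Iterable  # was: from typing import Iterable
    from functools import lru_cache


    @lru_cache  # was: @lru_cache()
    def load_lines(path: str) -> tuple[str, ...]:
        with open(path, encoding="utf-8") as f:  # was: open(path, "r", encoding="utf-8")
            return tuple(f.readlines())


    class Sniffer:  # was: class Sniffer(object)
        def check(self, delimiter: str, allowed: Iterable[str]) -> None:
            if delimiter not in set(allowed):  # was: if not delimiter in ...
                # f"..." without placeholders becomes "...", and
                # "{}".format(x) / f"{repr(x)}" become f"{x}" / f"{x!r}"
                raise RuntimeError(f"unknown delimiter {delimiter!r}")

        def report(self, n: int) -> None:
            for i in range(n):  # was: range(0, n)
                print(f"row {i}")

In the hunks that follow, lines prefixed with "-" are removed and lines prefixed with "+" are added; unprefixed lines are unchanged context.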
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.lines = text_stream.split("\n")
if isinstance(self.path_or_stream, Path):
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
+ with open(self.path_or_stream, encoding="utf-8") as f:
self.lines = f.readlines()
self.valid = True

@@ -75,7 +75,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

return doc

- def _parse(self, doc: DoclingDocument):
+ def _parse(self, doc: DoclingDocument): # noqa: C901
"""
Main function that orchestrates the parsing by yielding components:
title, section headers, text, lists, and tables.

@@ -95,7 +95,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
indents: dict[int, Union[GroupItem, None]] = {}

- for i in range(0, 10):
+ for i in range(10):
parents[i] = None
indents[i] = None

@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
head = self.content.readline()
dialect = csv.Sniffer().sniff(head, ",;\t|:")
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
- if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+ if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
raise RuntimeError(
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
)
@@ -1,8 +1,9 @@
import logging
import random
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import Iterable, List, Optional, Union
+ from typing import List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size

@@ -1,8 +1,9 @@
import logging
import random
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+ from typing import TYPE_CHECKING, List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -1,14 +1,14 @@
import logging
import random
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+ from typing import TYPE_CHECKING, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
- from PIL import Image, ImageDraw
+ from PIL import Image
from pypdfium2 import PdfPage

from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
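The three hunks above are the same one-line migration repeated across the PDF backends: `Iterable` moves from `typing` to `collections.abc`, since the `typing` aliases for container ABCs are deprecated since Python 3.9. A minimal sketch of the pattern, with an invented function:

    from collections.abc import Iterable  # preferred home of the ABC since Python 3.9
    from typing import Optional


    def count_cells(cells: Iterable[str], limit: Optional[int] = None) -> int:
        total = sum(1 for _ in cells)
        return total if limit is None else min(total, limit)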
@@ -1,12 +1,8 @@
- # -*- coding: utf-8 -*-
-
"""
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
On 23/01/2025
"""

- from __future__ import unicode_literals
-
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

BLANK = ""

@@ -79,7 +75,6 @@ CHR_BO = {
}

T = {
"\u2192": "\\rightarrow ",
# Greek letters
"\U0001d6fc": "\\alpha ",
"\U0001d6fd": "\\beta ",

@@ -76,7 +76,7 @@ def get_val(key, default=None, store=CHR):
return default


- class Tag2Method(object):
+ class Tag2Method:
def call_method(self, elm, stag=None):
getmethod = self.tag2meth.get
if stag is None:

@@ -157,7 +157,7 @@ class Pr(Tag2Method):
def do_common(self, elm):
stag = elm.tag.replace(OMML_NS, "")
if stag in self.__val_tags:
- t = elm.get("{0}val".format(OMML_NS))
+ t = elm.get(f"{OMML_NS}val")
self.__innerdict[stag] = t
return None

@@ -246,7 +246,6 @@ class oMath2Latex(Tag2Method):
"""
the Pre-Sub-Superscript object -- Not support yet
"""
- pass

def do_sub(self, elm):
text = self.process_children(elm)

@@ -329,7 +328,7 @@ class oMath2Latex(Tag2Method):
t_dict = self.process_children_dict(elm, include=("e", "lim"))
latex_s = LIM_FUNC.get(t_dict["e"])
if not latex_s:
- raise NotSupport("Not support lim %s" % t_dict["e"])
+ raise RuntimeError("Not support lim %s" % t_dict["e"])
else:
return latex_s.format(lim=t_dict.get("lim"))

@@ -411,7 +410,7 @@ class oMath2Latex(Tag2Method):
"""
_str = []
_base_str = []
- found_text = elm.findtext("./{0}t".format(OMML_NS))
+ found_text = elm.findtext(f"./{OMML_NS}t")
if found_text:
for s in found_text:
out_latex_str = self.process_unicode(s)
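The `latex_dict`/`oMath2Latex` hunks combine several of the rewrites: dropping a Python-2 coding cookie and `from __future__ import unicode_literals`, replacing `"{0}...".format(ns)` calls with f-strings, and (in later hunks) swapping `{repr(x)}`/`{str(x)}` inside f-strings for the `!r`/`!s` conversion flags. The `.format()`-to-f-string change is purely cosmetic; a small self-contained illustration (the namespace value is a stand-in, not the real OMML namespace):

    OMML_NS = "{http://example/ns}"  # stand-in value for illustration

    tag_old = "{0}val".format(OMML_NS)  # before
    tag_new = f"{OMML_NS}val"           # after
    assert tag_old == tag_new

    value = "x"
    assert f"got {repr(value)}" == f"got {value!r}" == "got 'x'"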
@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.max_levels = 10
self.level = 0
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
- for i in range(0, self.max_levels):
+ for i in range(self.max_levels):
self.parents[i] = None

try:

@@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
_log.error(
- f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+ f"Error processing child from tag {tag.name}: {exc_child!r}"
)
raise exc_child
elif isinstance(element, NavigableString) and not isinstance(

@@ -357,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = ""
enumerated = False
if parent_label == GroupLabel.ORDERED_LIST:
- marker = f"{str(index_in_list)}."
+ marker = f"{index_in_list!s}."
enumerated = True
doc.add_list_item(
text=text,
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# otherwise they represent emphasis (bold or italic)
self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
+ with open(self.path_or_stream, encoding="utf-8") as f:
md_content = f.read()
# remove invalid sequences
# very long sequences of underscores will lead to unnecessary long processing times.

@@ -235,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_item = doc.add_group(
- label=label, name=f"list", parent=parent_item
+ label=label, name="list", parent=parent_item
)

elif (

@@ -319,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._html_blocks += 1
self._process_inline_text(parent_item, doc)
self._close_table(doc)
- _log.debug("HTML Block: {}".format(element))
+ _log.debug(f"HTML Block: {element}")
if (
len(element.body) > 0
): # If Marko doesn't return any content for HTML block, skip it

@@ -331,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else:
if not isinstance(element, str):
self._close_table(doc)
- _log.debug("Some other element: {}".format(element))
+ _log.debug(f"Some other element: {element}")

processed_block_types = (
marko.block.Heading,
@@ -120,7 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):

return prov

- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+ def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
is_a_list = False
is_list_group_created = False
enum_list_item_value = 0

@@ -243,7 +243,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
enum_marker = str(enum_list_item_value) + "."
if not is_list_group_created:
new_list = doc.add_group(
- label=list_label, name=f"list", parent=parent_slide
+ label=list_label, name="list", parent=parent_slide
)
is_list_group_created = True
doc.add_list_item(

@@ -372,7 +372,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):

max_levels = 10
parents = {}  # type: ignore
- for i in range(0, max_levels):
+ for i in range(max_levels):
parents[i] = None

# Loop through each slide
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
)
if cell is None or cell._tc in cell_set:
- _log.debug(f" skipped since repeated content")
+ _log.debug(" skipped since repeated content")
col_idx += cell.grid_span
continue
else:

@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
- except (UnidentifiedImageError, OSError) as e:
+ except (UnidentifiedImageError, OSError):
_log.warning("Warning: image cannot be loaded by Pillow")
doc.add_picture(
parent=self.parents[level - 1],
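The MsWord hunks above include another recurring cleanup: `except`-clause bindings (`as e`) are dropped when the handler never uses the exception object. A hedged sketch of the pattern; `load_image` is invented, not docling's API:

    from io import BytesIO

    from PIL import Image, UnidentifiedImageError


    def load_image(data: bytes):
        try:
            return Image.open(BytesIO(data))
        except (UnidentifiedImageError, OSError):  # was: `... as e:` with `e` unused
            # A handler that needs details can re-add the binding or log exc_info.
            return None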
@@ -1,7 +1,8 @@
from abc import ABC, abstractmethod
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import Iterable, Optional, Set, Union
+ from typing import Optional, Set, Union

from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.page import SegmentedPdfPage, TextCell

@@ -1,8 +1,9 @@
import logging
import random
+ from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+ from typing import TYPE_CHECKING, List, Optional, Union

import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c

@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
self.valid = True  # No better way to tell from pypdfium.
try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
- except PdfiumError as e:
+ except PdfiumError:
_log.info(
f"An exception occurred when loading page {page_no} of document {document_hash}.",
exc_info=True,
@@ -348,7 +348,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

return

- def _parse_element_citation(self, node: etree._Element) -> str:
+ def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
citation: Citation = {
"author_names": "",
"title": "",

@@ -439,7 +439,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
if len(node.xpath("lpage")) > 0:
citation["page"] += (
- "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+ "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
)

# Flatten the citation to string

@@ -594,9 +594,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

try:
self._add_table(doc, parent, table)
- except Exception as e:
- _log.warning(f"Skipping unsupported table in {str(self.file)}")
- pass
+ except Exception:
+ _log.warning(f"Skipping unsupported table in {self.file!s}")

return
@@ -162,7 +162,6 @@ class PatentUspto(ABC):
Returns:
The patent parsed as a docling document.
"""
- pass


class PatentUsptoIce(PatentUspto):

@@ -264,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
self.style_html = HtmlEntity()

@override
- def startElement(self, tag, attributes): # noqa: N802
+ def startElement(self, tag, attributes):
"""Signal the start of an element.

Args:

@@ -280,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
self._start_registered_elements(tag, attributes)

@override
- def skippedEntity(self, name): # noqa: N802
+ def skippedEntity(self, name):
"""Receive notification of a skipped entity.

HTML entities will be skipped by the parser. This method will unescape them

@@ -314,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
self.text += unescaped

@override
- def endElement(self, tag): # noqa: N802
+ def endElement(self, tag):
"""Signal the end of an element.

Args:

@@ -602,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.style_html = HtmlEntity()

@override
- def startElement(self, tag, attributes): # noqa: N802
+ def startElement(self, tag, attributes):
"""Signal the start of an element.

Args:

@@ -615,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
self._start_registered_elements(tag, attributes)

@override
- def skippedEntity(self, name): # noqa: N802
+ def skippedEntity(self, name):
"""Receive notification of a skipped entity.

HTML entities will be skipped by the parser. This method will unescape them

@@ -649,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.text += unescaped

@override
- def endElement(self, tag): # noqa: N802
+ def endElement(self, tag):
"""Signal the end of an element.

Args:

@@ -690,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
if tag in [member.value for member in self.Element]:
if (
tag == self.Element.HEADING.value
- and not self.Element.SDOCL.value in self.property
+ and self.Element.SDOCL.value not in self.property
):
level_attr: str = attributes.get("LVL", "")
new_level: int = int(level_attr) if level_attr.isnumeric() else 1

@@ -742,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
# headers except claims statement
elif (
self.Element.HEADING.value in self.property
- and not self.Element.SDOCL.value in self.property
+ and self.Element.SDOCL.value not in self.property
and text.strip()
):
self.parents[self.level + 1] = self.doc.add_heading(

@@ -1163,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
self.style_html = HtmlEntity()

@override
- def startElement(self, tag, attributes): # noqa: N802
+ def startElement(self, tag, attributes):
"""Signal the start of an element.

Args:

@@ -1176,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
self._start_registered_elements(tag, attributes)

@override
- def skippedEntity(self, name): # noqa: N802
+ def skippedEntity(self, name):
"""Receive notification of a skipped entity.

HTML entities will be skipped by the parser. This method will unescape them

@@ -1210,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
self.text += unescaped

@override
- def endElement(self, tag): # noqa: N802
+ def endElement(self, tag):
"""Signal the end of an element.

Args:

@@ -1526,7 +1525,7 @@ class XmlTable:

return ncols_max

- def _parse_table(self, table: Tag) -> TableData:
+ def _parse_table(self, table: Tag) -> TableData: # noqa: C901
"""Parse the content of a table tag.

Args:

@@ -1721,7 +1720,7 @@ class HtmlEntity:
"0": "⁰",
"+": "⁺",
"-": "⁻",
- "−": "⁻",
+ "−": "⁻", # noqa: RUF001
"=": "⁼",
"(": "⁽",
")": "⁾",

@@ -1745,7 +1744,7 @@ class HtmlEntity:
"0": "₀",
"+": "₊",
"-": "₋",
- "−": "₋",
+ "−": "₋", # noqa: RUF001
"=": "₌",
"(": "₍",
")": "₎",
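The `HtmlEntity` tables and the JATS citation hunk keep intentionally non-ASCII characters, the true minus sign `−` (U+2212) and the en dash `–`, and silence ruff's ambiguous-character warning with `# noqa: RUF001` instead of "correcting" the data, because those characters genuinely occur in patent and journal sources and must map differently from ASCII `-`. A tiny sketch of why the lookalikes have to stay distinct keys:

    # U+2212 (minus sign) and ASCII hyphen-minus are different keys that happen
    # to map to the same superscript character here.
    SUPERSCRIPT = {
        "-": "⁻",
        "−": "⁻",  # noqa: RUF001  (intentional: real minus sign from source text)
    }

    assert "−" != "-"
    assert SUPERSCRIPT["−"] == SUPERSCRIPT["-"] == "⁻"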
@@ -6,8 +6,9 @@ import sys
import tempfile
import time
import warnings
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Annotated, Dict, Iterable, List, Optional, Type
+ from typing import Annotated, Dict, List, Optional, Type

import rich.table
import typer

@@ -288,7 +289,7 @@ def convert(
...,
help=(
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
- f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+ f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
f"Use the option --show-external-plugins to see the options allowed with external plugins."
),
),

@@ -62,7 +62,7 @@ def download(
models: Annotated[
Optional[list[_AvailableModels]],
typer.Argument(
- help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
+ help="Models to download (default behavior: a predefined set of models will be downloaded).",
),
] = None,
all: Annotated[
@@ -10,7 +10,7 @@ from docling_core.types.doc import (
TableCell,
)
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
- from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
+ from docling_core.types.io import (
DocumentStream,
)
from PIL.Image import Image

@@ -243,7 +243,7 @@ class Page(BaseModel):
if self._backend is None:
return self._image_cache.get(scale, None)

- if not scale in self._image_cache:
+ if scale not in self._image_cache:
if cropbox is None:
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
else:
@@ -1,13 +1,13 @@
import csv
import logging
import re
+ from collections.abc import Iterable
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import (
TYPE_CHECKING,
Dict,
- Iterable,
List,
Literal,
Optional,

@@ -18,31 +18,9 @@ from typing import (

import filetype
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
PictureItem,
SectionHeaderItem,
TableItem,
TextItem,
)
from docling_core.types.doc.document import ListItem
from docling_core.types.legacy_doc.base import (
BaseText,
Figure,
GlmTableCell,
PageDimensions,
PageReference,
Prov,
Ref,
)
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
from docling_core.types.legacy_doc.base import TableCell
from docling_core.types.legacy_doc.document import (
CCSDocumentDescription as DsDocumentDescription,
)
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel

@@ -65,7 +43,7 @@ from docling.datamodel.base_models import (
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
- from docling.utils.utils import create_file_hash, create_hash
+ from docling.utils.utils import create_file_hash

if TYPE_CHECKING:
from docling.document_converter import FormatOption
@@ -1,11 +1,11 @@
import hashlib
import logging
import math
import sys
import time
+ from collections.abc import Iterable, Iterator
from functools import partial
from pathlib import Path
- from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+ from typing import Dict, List, Optional, Tuple, Type, Union

from pydantic import BaseModel, ConfigDict, model_validator, validate_call

@@ -254,7 +254,7 @@ class DocumentConverter:

if not had_result and raises_on_error:
raise ConversionError(
- f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+ "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
)

def _convert(

@@ -266,7 +266,7 @@ class DocumentConverter:
conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size,  # pass format_options
):
- _log.info(f"Going to convert document batch...")
+ _log.info("Going to convert document batch...")

# parallel processing only within input_batch
# with ThreadPoolExecutor(
@@ -1,4 +1,4 @@
- from typing import Iterable
+ from collections.abc import Iterable

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult

@@ -1,5 +1,6 @@
from abc import ABC, abstractmethod
- from typing import Any, Generic, Iterable, Optional, Protocol, Type
+ from collections.abc import Iterable
+ from typing import Generic, Optional, Protocol, Type

from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from typing_extensions import TypeVar

@@ -1,12 +1,12 @@
import copy
import logging
from abc import abstractmethod
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Optional, Type
+ from typing import List, Optional, Type

import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
@@ -1,7 +1,8 @@
import re
from collections import Counter
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Literal, Optional, Tuple, Union
+ from typing import List, Literal, Optional, Tuple, Union

import numpy as np
from docling_core.types.doc import (

@@ -1,5 +1,6 @@
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Literal, Optional, Tuple, Union
+ from typing import List, Literal, Optional, Union

import numpy as np
from docling_core.types.doc import (

@@ -1,8 +1,9 @@
import logging
import warnings
import zipfile
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Optional, Type
+ from typing import List, Optional, Type

import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -98,8 +99,10 @@ class EasyOcrModel(BaseOcrModel):
progress: bool = False,
) -> Path:
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
- from easyocr.config import detection_models as det_models_dict
- from easyocr.config import recognition_models as rec_models_dict
+ from easyocr.config import (
+ detection_models as det_models_dict,
+ recognition_models as rec_models_dict,
+ )

if local_dir is None:
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
logger = logging.getLogger(__name__)


- @lru_cache()
+ @lru_cache
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
factory = OcrFactory()
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)

@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
return factory


- @lru_cache()
+ @lru_cache
def get_picture_description_factory(
allow_external_plugins: bool = False,
) -> PictureDescriptionFactory:
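The factory hunks use the fact that since Python 3.8 `functools.lru_cache` works directly as a bare decorator, so `@lru_cache()` and `@lru_cache` are equivalent when no `maxsize`/`typed` arguments are passed. A minimal sketch (the dict stands in for docling's plugin factories):

    from functools import lru_cache


    @lru_cache  # equivalent to @lru_cache() with default arguments
    def get_factory() -> dict:
        return {"plugins": []}


    assert get_factory() is get_factory()  # one cached instance for every call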
@@ -1,18 +1,16 @@
import logging
import time
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Optional
+ from typing import Optional

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)

@@ -1,16 +1,15 @@
import logging
import time
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, List, Optional
+ from typing import Optional

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

@@ -41,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
device = decide_device(accelerator_options.device)
self.device = device

- _log.debug("Available device for HuggingFace VLM: {}".format(device))
+ _log.debug(f"Available device for HuggingFace VLM: {device}")

repo_cache_folder = vlm_options.repo_id.replace("/", "--")
@@ -1,8 +1,9 @@
import copy
import logging
import warnings
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Union
+ from typing import Optional

from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor

@@ -1,8 +1,9 @@
import logging
import sys
import tempfile
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Tuple, Type
+ from typing import Optional, Type

from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell

@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):

if self.enabled:
if "darwin" != sys.platform:
- raise RuntimeError(f"OcrMac is only supported on Mac.")
+ raise RuntimeError("OcrMac is only supported on Mac.")
install_errmsg = (
"ocrmac is not correctly installed. "
"Please install it via `pip install ocrmac` to use this OCR engine. "

@@ -1,6 +1,7 @@
import logging
import re
- from typing import Iterable, List
+ from collections.abc import Iterable
+ from typing import List

from pydantic import BaseModel

@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
sanitized_text = "".join(lines)

# Text normalization
- sanitized_text = sanitized_text.replace("⁄", "/")
- sanitized_text = sanitized_text.replace("’", "'")
- sanitized_text = sanitized_text.replace("‘", "'")
+ sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+ sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+ sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
sanitized_text = sanitized_text.replace("“", '"')
sanitized_text = sanitized_text.replace("”", '"')
sanitized_text = sanitized_text.replace("•", "·")
@@ -1,5 +1,6 @@
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional
+ from typing import Optional

from PIL import ImageDraw
from pydantic import BaseModel

@@ -1,5 +1,6 @@
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Type, Union
+ from typing import Optional, Type, Union

from PIL import Image

@@ -1,12 +1,11 @@
import logging
from abc import abstractmethod
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Any, Iterable, List, Optional, Type, Union
+ from typing import List, Optional, Type, Union

from docling_core.types.doc import (
DoclingDocument,
NodeItem,
PictureClassificationClass,
PictureItem,
)
from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc

@@ -1,5 +1,6 @@
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Type, Union
+ from typing import Optional, Type, Union

from PIL import Image

@@ -1,6 +1,7 @@
import logging
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Type
+ from typing import Optional, Type

import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -1,12 +1,7 @@
import copy
import random
from pathlib import Path
from typing import Dict, List

from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,

@@ -17,13 +12,10 @@ from docling_core.types.doc import (
TableData,
)
from docling_core.types.doc.document import ContentLayer
from docling_core.types.legacy_doc.base import Ref
from docling_core.types.legacy_doc.document import BaseText
- from docling_ibm_models.reading_order.reading_order_rb import (
- PageElement as ReadingOrderPageElement,
- ReadingOrderPredictor,
- )
+ from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict

from docling.datamodel.base_models import (

@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
TextElement,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -1,13 +1,13 @@
import copy
import warnings
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Union
+ from typing import Optional

import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_core.types.doc.page import (
BoundingRectangle,
SegmentedPdfPage,
TextCellUnit,
)
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor

@@ -3,9 +3,10 @@ import io
import logging
import os
import tempfile
+ from collections.abc import Iterable
from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen
- from typing import Iterable, List, Optional, Tuple, Type
+ from typing import List, Optional, Tuple, Type

import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -1,6 +1,7 @@
import logging
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable, Optional, Type
+ from typing import Optional, Type

from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell

@@ -3,9 +3,10 @@ import logging
import time
import traceback
from abc import ABC, abstractmethod
- from typing import Any, Callable, Iterable, List
+ from collections.abc import Iterable
+ from typing import Any, Callable, List

- from docling_core.types.doc import DoclingDocument, NodeItem
+ from docling_core.types.doc import NodeItem

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend

@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.

total_elapsed_time = 0.0
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
- for i in range(0, conv_res.input.page_count):
+ for i in range(conv_res.input.page_count):
start_page, end_page = conv_res.input.limits.page_range
if (start_page - 1) <= i <= (end_page - 1):
conv_res.pages.append(Page(page_no=i))
@@ -1,5 +1,4 @@
import logging
import sys
import warnings
from pathlib import Path
from typing import Optional, cast

@@ -1,5 +1,4 @@
import logging
import warnings
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast

@@ -1,8 +1,8 @@
import logging
- from typing import Any, Dict, Iterable, List, Tuple, Union
+ from collections.abc import Iterable
+ from typing import Any, Dict, List, Tuple, Union

from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table

from docling.datamodel.document import ConversionResult, Page

@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
return unique_objects


- def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
origin = DocumentOrigin(
mimetype="application/pdf",
filename=doc_glm["file-info"]["filename"],

@@ -18,7 +18,7 @@ class UnionFind:

def __init__(self, elements):
self.parent = {elem: elem for elem in elements}
- self.rank = {elem: 0 for elem in elements}
+ self.rank = dict.fromkeys(elements, 0)

def find(self, x):
if self.parent[x] != x:
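The `UnionFind` hunk replaces a dict comprehension whose value does not depend on the key with `dict.fromkeys`. The two spellings are equivalent for an immutable value like `0`; with a mutable value, `fromkeys` would share a single object across all keys, which is why the rewrite is only safe here. Sketch:

    elements = ["a", "b", "c"]

    assert {elem: 0 for elem in elements} == dict.fromkeys(elements, 0)

    # The caveat: fromkeys shares one object when the value is mutable.
    shared = dict.fromkeys(elements, [])
    shared["a"].append(1)
    assert shared["b"] == [1]  # every key points at the same list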
@@ -37,7 +37,7 @@ def download_models(
output_dir.mkdir(exist_ok=True, parents=True)

if with_layout:
- _log.info(f"Downloading layout model...")
+ _log.info("Downloading layout model...")
LayoutModel.download_models(
local_dir=output_dir / LayoutModel._model_repo_folder,
force=force,

@@ -45,7 +45,7 @@ def download_models(
)

if with_tableformer:
- _log.info(f"Downloading tableformer model...")
+ _log.info("Downloading tableformer model...")
TableStructureModel.download_models(
local_dir=output_dir / TableStructureModel._model_repo_folder,
force=force,

@@ -53,7 +53,7 @@ def download_models(
)

if with_picture_classifier:
- _log.info(f"Downloading picture classifier model...")
+ _log.info("Downloading picture classifier model...")
DocumentPictureClassifier.download_models(
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
force=force,

@@ -61,7 +61,7 @@ def download_models(
)

if with_code_formula:
- _log.info(f"Downloading code formula model...")
+ _log.info("Downloading code formula model...")
CodeFormulaModel.download_models(
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
force=force,

@@ -69,7 +69,7 @@ def download_models(
)

if with_smolvlm:
- _log.info(f"Downloading SmolVlm model...")
+ _log.info("Downloading SmolVlm model...")
PictureDescriptionVlmModel.download_models(
repo_id=smolvlm_picture_description.repo_id,
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,

@@ -78,7 +78,7 @@ def download_models(
)

if with_granite_vision:
- _log.info(f"Downloading Granite Vision model...")
+ _log.info("Downloading Granite Vision model...")
PictureDescriptionVlmModel.download_models(
repo_id=granite_picture_description.repo_id,
local_dir=output_dir / granite_picture_description.repo_cache_folder,

@@ -87,7 +87,7 @@ def download_models(
)

if with_easyocr:
- _log.info(f"Downloading easyocr models...")
+ _log.info("Downloading easyocr models...")
EasyOcrModel.download_models(
local_dir=output_dir / EasyOcrModel._model_repo_folder,
force=force,
@@ -383,7 +383,7 @@
"\n",
"print(f\"Downloading {url}...\")\n",
"buf = BytesIO(requests.get(url).content)\n",
- "print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
+ "print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
"with zipfile.ZipFile(buf) as zf:\n",
" res = zf.testzip()\n",
" if res:\n",

@@ -1,8 +1,8 @@
import json
import logging
import time
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable

import yaml
from docling_core.types.doc import ImageRefMode

@@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

_log = logging.getLogger(__name__)
@@ -3,7 +3,6 @@ import logging
import time
from pathlib import Path

- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,

@@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
- from docling.models.ocr_mac_model import OcrMacOptions
- from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
- from docling.models.tesseract_ocr_model import TesseractOcrOptions

_log = logging.getLogger(__name__)
@@ -3,8 +3,8 @@
# It does not run the actual formula understanding model.

import logging
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Iterable

from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem

@@ -3,8 +3,9 @@
# It does not run the actual picture classifier model.

import logging
+ from collections.abc import Iterable
from pathlib import Path
- from typing import Any, Iterable
+ from typing import Any

from docling_core.types.doc import (
DoclingDocument,

@@ -4,7 +4,7 @@ from pathlib import Path

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

- from docling.datamodel.base_models import FigureElement, InputFormat, Table
+ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -1,14 +1,9 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -153,10 +153,10 @@
"source": [
"for i, chunk in enumerate(chunk_iter):\n",
" print(f\"=== {i} ===\")\n",
- " print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
+ " print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
"\n",
" enriched_text = chunker.serialize(chunk=chunk)\n",
- " print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
+ " print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
"\n",
" print()"
]

@@ -353,11 +353,11 @@
"for i, chunk in enumerate(chunks):\n",
" print(f\"=== {i} ===\")\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
- " print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
+ " print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
"\n",
" ser_txt = chunker.serialize(chunk=chunk)\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
- " print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
+ " print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
"\n",
" print()"
]
@@ -2,17 +2,11 @@ import json
import time
from pathlib import Path

import yaml

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

@@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
for source in sources:
start_time = time.time()
print("================================================")
- print("Processing... {}".format(source))
+ print(f"Processing... {source}")
print("================================================")
print("")

@@ -77,7 +71,7 @@ for source in sources:
print(page.predictions.vlm_response.text)

res.document.save_as_html(
- filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
+ filename=Path(f"{out_path}/{res.input.file.stem}.html"),
image_mode=ImageRefMode.REFERENCED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
)
@@ -144,7 +144,7 @@
"for pic in doc.pictures[:5]:\n",
" html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
- " f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+ " f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n",
" for annotation in pic.annotations:\n",

@@ -252,7 +252,7 @@
"for pic in doc.pictures[:5]:\n",
" html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
- " f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+ " f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n",
" for annotation in pic.annotations:\n",

@@ -351,7 +351,7 @@
"for source in sources:\n",
" if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
" doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
- " print(f\"- text: {repr(doc_chunk.text)}\")\n",
+ " print(f\"- text: {doc_chunk.text!r}\")\n",
" if doc_chunk.meta.origin:\n",
" print(f\" file: {doc_chunk.meta.origin.filename}\")\n",
" if doc_chunk.meta.headings:\n",
@@ -119,7 +119,7 @@
" device = torch.device(\"mps\")\n",
" print(\"MPS GPU is enabled.\")\n",
"else:\n",
- " raise EnvironmentError(\n",
+ " raise OSError(\n",
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
" )"
]

@@ -226,7 +226,6 @@
}
],
"source": [
"from docling.datamodel.document import ConversionResult\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"# Instantiate the doc converter\n",

@@ -345,7 +344,7 @@
"\n",
" openai_api_key = os.getenv(openai_api_key_var)\n",
" if not openai_api_key:\n",
- " raise EnvironmentError(\n",
+ " raise OSError(\n",
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
" \"Please define it before running this script.\"\n",
" )"

@@ -387,7 +386,6 @@
"outputs": [],
"source": [
"import weaviate.classes.config as wc\n",
"from weaviate.classes.config import DataType, Property\n",
"\n",
"# Define the collection name\n",
"collection_name = \"docling\"\n",
@@ -25,7 +25,7 @@ def main():
document = mdb.convert()

out_path = Path("scratch")
- print(f"Document {path} converted.\nSaved markdown output to: {str(out_path)}")
+ print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")

# Export Docling document format to markdowndoc:
fn = os.path.basename(path)

@@ -1,13 +1,10 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -63,7 +63,7 @@ def main():
out_path = Path("scratch")
print(
f"Document {res.input.file.name} converted."
- f"\nSaved markdown output to: {str(out_path)}"
+ f"\nSaved markdown output to: {out_path!s}"
)
_log.debug(res.document._export_to_indented_text(max_text_len=16))
# Export Docling document format to markdowndoc:
@@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -2,9 +2,9 @@ import logging
import time
from pathlib import Path

- from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
+ from docling_core.types.doc import ImageRefMode, TableItem, TextItem

- from docling.datamodel.base_models import FigureElement, InputFormat, Table
+ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
ApiVlmOptions,
ResponseFormat,
VlmPipelineOptions,
- granite_vision_vlm_ollama_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -202,12 +202,16 @@ select = [
]

ignore = [
"C408", # Unnecessary `dict()` call (rewrite as a literal)
"E501", # Line too long, handled by ruff formatter
"D107", # "Missing docstring in __init__",
"F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
"F811", # "redefinition of the same function"
"PL", # Pylint
"RUF012", # Mutable Class Attributes
"UP006", # List vs list, etc
"UP007", # Option and Union
"UP035", # `typing.Set` is deprecated, use `set` instead"
]

#extend-select = []

@@ -217,7 +221,7 @@ ignore = [
"tests/*.py" = ["ASYNC"]  # Disable ASYNC check for tests

[tool.ruff.lint.mccabe]
- max-complexity = 15
+ max-complexity = 20

# [tool.ruff.lint.isort.sections]
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
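Note how the configuration and the code changes interlock: the pyproject hunk raises the McCabe complexity ceiling to 20, and the earlier hunks add `# noqa: C901` to the handful of parser functions that still exceed it, rather than refactoring them in a lint-only commit. As a hedged sketch of how such a scoped suppression reads (the function is invented and deliberately trivial):

    def dispatch(token: str) -> int:  # noqa: C901 - suppression scoped to this function
        # In docling this sits on long parser functions (e.g. _parse,
        # handle_text_elements, _parse_table) whose branching exceeds the limit.
        if token == "a":
            return 1
        elif token == "b":
            return 2
        return 0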
@@ -37,7 +37,7 @@ def test_asciidocs_examples():
print("\n\n", pred_mddoc)

if os.path.exists(gname):
- with open(gname, "r") as fr:
+ with open(gname) as fr:
true_mddoc = fr.read()

# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"

@@ -1,5 +1,3 @@
- import json
- import os
from pathlib import Path

from pytest import warns

@@ -16,7 +14,7 @@ GENERATE = GEN_TEST_DATA

def get_csv_paths():
# Define the directory you want to search
- directory = Path(f"./tests/data/csv/")
+ directory = Path("./tests/data/csv/")

# List all CSV files in the directory and its subdirectories
return sorted(directory.rglob("*.csv"))
@@ -32,7 +32,7 @@ def test_text_cell_counts():

doc_backend = _get_backend(pdf_doc)

- for page_index in range(0, doc_backend.page_count()):
+ for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

@@ -31,7 +31,7 @@ def test_text_cell_counts():

doc_backend = _get_backend(pdf_doc)

- for page_index in range(0, doc_backend.page_count()):
+ for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

@@ -31,7 +31,7 @@ def test_text_cell_counts():

doc_backend = _get_backend(pdf_doc)

- for page_index in range(0, doc_backend.page_count()):
+ for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)

@@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA


def get_pubmed_paths():
- directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
+ directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
xml_files = sorted(directory.rglob("*.xml"))
return xml_files

@@ -1,4 +1,3 @@
- import os
from pathlib import Path

from docling.backend.msword_backend import MsWordDocumentBackend
@@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
assert isinstance(texts[2], TextItem)
assert texts[2].text == (
"An interleaver receives incoming data frames of size N. The interleaver "
- "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
+ "indexes the elements of the frame with an N₁×N₂ index array. The interleaver " # noqa: RUF001
"then effectively rearranges (permutes) the data by permuting the rows of the "
- "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
+ "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to " # noqa: RUF001
"permute the columns (indexed by k) of each row (indexed by j). P is at least "
"equal to N₂, βj is a constant which may be different for each row, and each "
- "αj is a relative prime number relative to P. After permuting, the "
+ "αj is a relative prime number relative to P. After permuting, the " # noqa: RUF001
"interleaver outputs the data in a different order than received (e.g., "
"receives sequentially row by row, outputs sequentially each column by column)."
)

@@ -32,7 +32,7 @@ def test_text_cell_counts():

doc_backend = _get_backend(pdf_doc)

- for page_index in range(0, doc_backend.page_count()):
+ for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
@@ -1,4 +1,3 @@
- import os
from pathlib import Path

from docling.datamodel.base_models import InputFormat

@@ -3,7 +3,6 @@ from pathlib import Path
from docling_core.types.doc import CodeItem, TextItem
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions

@@ -2,7 +2,6 @@ from pathlib import Path

from docling_core.types.doc import PictureClassificationData

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions

@@ -1,7 +1,6 @@
from pathlib import Path

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions

@@ -3,7 +3,6 @@ from pathlib import Path
from typing import List

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (

@@ -4,7 +4,6 @@ from pathlib import Path

import pytest

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -3,8 +3,6 @@ from pathlib import Path

import pytest

- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
- from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -216,7 +216,7 @@ def verify_picture_image_v2(


def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
- assert len(doc_pred.texts) == len(doc_true.texts), f"Text lengths do not match."
+ assert len(doc_pred.texts) == len(doc_true.texts), "Text lengths do not match."

assert len(doc_true.tables) == len(doc_pred.tables), (
"document has different count of tables than expected."

@@ -230,7 +230,7 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
assert isinstance(pred_item, DocItem), "Test item is not a DocItem"

# Validate type
- assert true_item.label == pred_item.label, f"Object label does not match."
+ assert true_item.label == pred_item.label, "Object label does not match."

# Validate provenance
assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch"

@@ -337,16 +337,16 @@ def verify_conversion_result_v1(
with open(dt_path, "w") as fw:
fw.write(doc_pred_dt)
else:  # default branch in test
- with open(pages_path, "r") as fr:
+ with open(pages_path) as fr:
doc_true_pages = PageList.validate_json(fr.read())

- with open(json_path, "r") as fr:
+ with open(json_path) as fr:
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())

- with open(md_path, "r") as fr:
+ with open(md_path) as fr:
doc_true_md = fr.read()

- with open(dt_path, "r") as fr:
+ with open(dt_path) as fr:
doc_true_dt = fr.read()

if not fuzzy:

@@ -419,16 +419,16 @@ def verify_conversion_result_v2(
with open(dt_path, "w") as fw:
fw.write(doc_pred_dt)
else:  # default branch in test
- with open(pages_path, "r") as fr:
+ with open(pages_path) as fr:
doc_true_pages = PageList.validate_json(fr.read())

- with open(json_path, "r") as fr:
+ with open(json_path) as fr:
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())

- with open(md_path, "r") as fr:
+ with open(md_path) as fr:
doc_true_md = fr.read()

- with open(dt_path, "r") as fr:
+ with open(dt_path) as fr:
doc_true_dt = fr.read()

if not fuzzy: