mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
apply ruff lint fixes
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
d74e407526
commit
73cec158c6
@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|||||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||||
self.lines = text_stream.split("\n")
|
self.lines = text_stream.split("\n")
|
||||||
if isinstance(self.path_or_stream, Path):
|
if isinstance(self.path_or_stream, Path):
|
||||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
with open(self.path_or_stream, encoding="utf-8") as f:
|
||||||
self.lines = f.readlines()
|
self.lines = f.readlines()
|
||||||
self.valid = True
|
self.valid = True
|
||||||
|
|
||||||
@ -75,7 +75,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _parse(self, doc: DoclingDocument):
|
def _parse(self, doc: DoclingDocument): # noqa: C901
|
||||||
"""
|
"""
|
||||||
Main function that orchestrates the parsing by yielding components:
|
Main function that orchestrates the parsing by yielding components:
|
||||||
title, section headers, text, lists, and tables.
|
title, section headers, text, lists, and tables.
|
||||||
@ -95,7 +95,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|||||||
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||||
indents: dict[int, Union[GroupItem, None]] = {}
|
indents: dict[int, Union[GroupItem, None]] = {}
|
||||||
|
|
||||||
for i in range(0, 10):
|
for i in range(10):
|
||||||
parents[i] = None
|
parents[i] = None
|
||||||
indents[i] = None
|
indents[i] = None
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
head = self.content.readline()
|
head = self.content.readline()
|
||||||
dialect = csv.Sniffer().sniff(head, ",;\t|:")
|
dialect = csv.Sniffer().sniff(head, ",;\t|:")
|
||||||
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
|
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
|
||||||
if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
|
if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
|
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
|
||||||
)
|
)
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
|
from collections.abc import Iterable
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
import pypdfium2 as pdfium
|
import pypdfium2 as pdfium
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
|
from collections.abc import Iterable
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
from typing import TYPE_CHECKING, List, Optional, Union
|
||||||
|
|
||||||
import pypdfium2 as pdfium
|
import pypdfium2 as pdfium
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
@ -1,14 +1,14 @@
|
|||||||
import logging
|
import logging
|
||||||
import random
|
from collections.abc import Iterable
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
from typing import TYPE_CHECKING, Optional, Union
|
||||||
|
|
||||||
import pypdfium2 as pdfium
|
import pypdfium2 as pdfium
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||||
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
||||||
from PIL import Image, ImageDraw
|
from PIL import Image
|
||||||
from pypdfium2 import PdfPage
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
|
@ -1,12 +1,8 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
||||||
On 23/01/2025
|
On 23/01/2025
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
|
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
|
||||||
|
|
||||||
BLANK = ""
|
BLANK = ""
|
||||||
@ -79,7 +75,6 @@ CHR_BO = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
T = {
|
T = {
|
||||||
"\u2192": "\\rightarrow ",
|
|
||||||
# Greek letters
|
# Greek letters
|
||||||
"\U0001d6fc": "\\alpha ",
|
"\U0001d6fc": "\\alpha ",
|
||||||
"\U0001d6fd": "\\beta ",
|
"\U0001d6fd": "\\beta ",
|
||||||
|
@ -76,7 +76,7 @@ def get_val(key, default=None, store=CHR):
|
|||||||
return default
|
return default
|
||||||
|
|
||||||
|
|
||||||
class Tag2Method(object):
|
class Tag2Method:
|
||||||
def call_method(self, elm, stag=None):
|
def call_method(self, elm, stag=None):
|
||||||
getmethod = self.tag2meth.get
|
getmethod = self.tag2meth.get
|
||||||
if stag is None:
|
if stag is None:
|
||||||
@ -157,7 +157,7 @@ class Pr(Tag2Method):
|
|||||||
def do_common(self, elm):
|
def do_common(self, elm):
|
||||||
stag = elm.tag.replace(OMML_NS, "")
|
stag = elm.tag.replace(OMML_NS, "")
|
||||||
if stag in self.__val_tags:
|
if stag in self.__val_tags:
|
||||||
t = elm.get("{0}val".format(OMML_NS))
|
t = elm.get(f"{OMML_NS}val")
|
||||||
self.__innerdict[stag] = t
|
self.__innerdict[stag] = t
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -246,7 +246,6 @@ class oMath2Latex(Tag2Method):
|
|||||||
"""
|
"""
|
||||||
the Pre-Sub-Superscript object -- Not support yet
|
the Pre-Sub-Superscript object -- Not support yet
|
||||||
"""
|
"""
|
||||||
pass
|
|
||||||
|
|
||||||
def do_sub(self, elm):
|
def do_sub(self, elm):
|
||||||
text = self.process_children(elm)
|
text = self.process_children(elm)
|
||||||
@ -329,7 +328,7 @@ class oMath2Latex(Tag2Method):
|
|||||||
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
||||||
latex_s = LIM_FUNC.get(t_dict["e"])
|
latex_s = LIM_FUNC.get(t_dict["e"])
|
||||||
if not latex_s:
|
if not latex_s:
|
||||||
raise NotSupport("Not support lim %s" % t_dict["e"])
|
raise RuntimeError("Not support lim %s" % t_dict["e"])
|
||||||
else:
|
else:
|
||||||
return latex_s.format(lim=t_dict.get("lim"))
|
return latex_s.format(lim=t_dict.get("lim"))
|
||||||
|
|
||||||
@ -411,7 +410,7 @@ class oMath2Latex(Tag2Method):
|
|||||||
"""
|
"""
|
||||||
_str = []
|
_str = []
|
||||||
_base_str = []
|
_base_str = []
|
||||||
found_text = elm.findtext("./{0}t".format(OMML_NS))
|
found_text = elm.findtext(f"./{OMML_NS}t")
|
||||||
if found_text:
|
if found_text:
|
||||||
for s in found_text:
|
for s in found_text:
|
||||||
out_latex_str = self.process_unicode(s)
|
out_latex_str = self.process_unicode(s)
|
||||||
|
@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.max_levels = 10
|
self.max_levels = 10
|
||||||
self.level = 0
|
self.level = 0
|
||||||
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
||||||
for i in range(0, self.max_levels):
|
for i in range(self.max_levels):
|
||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.analyze_tag(cast(Tag, element), doc)
|
self.analyze_tag(cast(Tag, element), doc)
|
||||||
except Exception as exc_child:
|
except Exception as exc_child:
|
||||||
_log.error(
|
_log.error(
|
||||||
f"Error processing child from tag {tag.name}: {repr(exc_child)}"
|
f"Error processing child from tag {tag.name}: {exc_child!r}"
|
||||||
)
|
)
|
||||||
raise exc_child
|
raise exc_child
|
||||||
elif isinstance(element, NavigableString) and not isinstance(
|
elif isinstance(element, NavigableString) and not isinstance(
|
||||||
@ -357,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
marker = ""
|
marker = ""
|
||||||
enumerated = False
|
enumerated = False
|
||||||
if parent_label == GroupLabel.ORDERED_LIST:
|
if parent_label == GroupLabel.ORDERED_LIST:
|
||||||
marker = f"{str(index_in_list)}."
|
marker = f"{index_in_list!s}."
|
||||||
enumerated = True
|
enumerated = True
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
text=text,
|
text=text,
|
||||||
|
@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# otherwise they represent emphasis (bold or italic)
|
# otherwise they represent emphasis (bold or italic)
|
||||||
self.markdown = self._shorten_underscore_sequences(text_stream)
|
self.markdown = self._shorten_underscore_sequences(text_stream)
|
||||||
if isinstance(self.path_or_stream, Path):
|
if isinstance(self.path_or_stream, Path):
|
||||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
with open(self.path_or_stream, encoding="utf-8") as f:
|
||||||
md_content = f.read()
|
md_content = f.read()
|
||||||
# remove invalid sequences
|
# remove invalid sequences
|
||||||
# very long sequences of underscores will lead to unnecessary long processing times.
|
# very long sequences of underscores will lead to unnecessary long processing times.
|
||||||
@ -235,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if has_non_empty_list_items:
|
if has_non_empty_list_items:
|
||||||
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
||||||
parent_item = doc.add_group(
|
parent_item = doc.add_group(
|
||||||
label=label, name=f"list", parent=parent_item
|
label=label, name="list", parent=parent_item
|
||||||
)
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
@ -319,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self._html_blocks += 1
|
self._html_blocks += 1
|
||||||
self._process_inline_text(parent_item, doc)
|
self._process_inline_text(parent_item, doc)
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
_log.debug("HTML Block: {}".format(element))
|
_log.debug(f"HTML Block: {element}")
|
||||||
if (
|
if (
|
||||||
len(element.body) > 0
|
len(element.body) > 0
|
||||||
): # If Marko doesn't return any content for HTML block, skip it
|
): # If Marko doesn't return any content for HTML block, skip it
|
||||||
@ -331,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
if not isinstance(element, str):
|
if not isinstance(element, str):
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
_log.debug("Some other element: {}".format(element))
|
_log.debug(f"Some other element: {element}")
|
||||||
|
|
||||||
processed_block_types = (
|
processed_block_types = (
|
||||||
marko.block.Heading,
|
marko.block.Heading,
|
||||||
|
@ -120,7 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
|
|
||||||
return prov
|
return prov
|
||||||
|
|
||||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
|
||||||
is_a_list = False
|
is_a_list = False
|
||||||
is_list_group_created = False
|
is_list_group_created = False
|
||||||
enum_list_item_value = 0
|
enum_list_item_value = 0
|
||||||
@ -243,7 +243,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
enum_marker = str(enum_list_item_value) + "."
|
enum_marker = str(enum_list_item_value) + "."
|
||||||
if not is_list_group_created:
|
if not is_list_group_created:
|
||||||
new_list = doc.add_group(
|
new_list = doc.add_group(
|
||||||
label=list_label, name=f"list", parent=parent_slide
|
label=list_label, name="list", parent=parent_slide
|
||||||
)
|
)
|
||||||
is_list_group_created = True
|
is_list_group_created = True
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
@ -372,7 +372,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
|
|
||||||
max_levels = 10
|
max_levels = 10
|
||||||
parents = {} # type: ignore
|
parents = {} # type: ignore
|
||||||
for i in range(0, max_levels):
|
for i in range(max_levels):
|
||||||
parents[i] = None
|
parents[i] = None
|
||||||
|
|
||||||
# Loop through each slide
|
# Loop through each slide
|
||||||
|
@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
||||||
)
|
)
|
||||||
if cell is None or cell._tc in cell_set:
|
if cell is None or cell._tc in cell_set:
|
||||||
_log.debug(f" skipped since repeated content")
|
_log.debug(" skipped since repeated content")
|
||||||
col_idx += cell.grid_span
|
col_idx += cell.grid_span
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||||
caption=None,
|
caption=None,
|
||||||
)
|
)
|
||||||
except (UnidentifiedImageError, OSError) as e:
|
except (UnidentifiedImageError, OSError):
|
||||||
_log.warning("Warning: image cannot be loaded by Pillow")
|
_log.warning("Warning: image cannot be loaded by Pillow")
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
from collections.abc import Iterable
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Set, Union
|
from typing import Optional, Set, Union
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, Size
|
from docling_core.types.doc import BoundingBox, Size
|
||||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
|
from collections.abc import Iterable
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
from typing import TYPE_CHECKING, List, Optional, Union
|
||||||
|
|
||||||
import pypdfium2 as pdfium
|
import pypdfium2 as pdfium
|
||||||
import pypdfium2.raw as pdfium_c
|
import pypdfium2.raw as pdfium_c
|
||||||
@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
self.valid = True # No better way to tell from pypdfium.
|
self.valid = True # No better way to tell from pypdfium.
|
||||||
try:
|
try:
|
||||||
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
||||||
except PdfiumError as e:
|
except PdfiumError:
|
||||||
_log.info(
|
_log.info(
|
||||||
f"An exception occurred when loading page {page_no} of document {document_hash}.",
|
f"An exception occurred when loading page {page_no} of document {document_hash}.",
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
|
@ -348,7 +348,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def _parse_element_citation(self, node: etree._Element) -> str:
|
def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
|
||||||
citation: Citation = {
|
citation: Citation = {
|
||||||
"author_names": "",
|
"author_names": "",
|
||||||
"title": "",
|
"title": "",
|
||||||
@ -439,7 +439,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
|
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
|
||||||
if len(node.xpath("lpage")) > 0:
|
if len(node.xpath("lpage")) > 0:
|
||||||
citation["page"] += (
|
citation["page"] += (
|
||||||
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
|
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
|
||||||
)
|
)
|
||||||
|
|
||||||
# Flatten the citation to string
|
# Flatten the citation to string
|
||||||
@ -594,9 +594,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
self._add_table(doc, parent, table)
|
self._add_table(doc, parent, table)
|
||||||
except Exception as e:
|
except Exception:
|
||||||
_log.warning(f"Skipping unsupported table in {str(self.file)}")
|
_log.warning(f"Skipping unsupported table in {self.file!s}")
|
||||||
pass
|
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -162,7 +162,6 @@ class PatentUspto(ABC):
|
|||||||
Returns:
|
Returns:
|
||||||
The patent parsed as a docling document.
|
The patent parsed as a docling document.
|
||||||
"""
|
"""
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class PatentUsptoIce(PatentUspto):
|
class PatentUsptoIce(PatentUspto):
|
||||||
@ -264,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
|
|||||||
self.style_html = HtmlEntity()
|
self.style_html = HtmlEntity()
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def startElement(self, tag, attributes): # noqa: N802
|
def startElement(self, tag, attributes):
|
||||||
"""Signal the start of an element.
|
"""Signal the start of an element.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -280,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
|
|||||||
self._start_registered_elements(tag, attributes)
|
self._start_registered_elements(tag, attributes)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def skippedEntity(self, name): # noqa: N802
|
def skippedEntity(self, name):
|
||||||
"""Receive notification of a skipped entity.
|
"""Receive notification of a skipped entity.
|
||||||
|
|
||||||
HTML entities will be skipped by the parser. This method will unescape them
|
HTML entities will be skipped by the parser. This method will unescape them
|
||||||
@ -314,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
|
|||||||
self.text += unescaped
|
self.text += unescaped
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def endElement(self, tag): # noqa: N802
|
def endElement(self, tag):
|
||||||
"""Signal the end of an element.
|
"""Signal the end of an element.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -602,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
self.style_html = HtmlEntity()
|
self.style_html = HtmlEntity()
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def startElement(self, tag, attributes): # noqa: N802
|
def startElement(self, tag, attributes):
|
||||||
"""Signal the start of an element.
|
"""Signal the start of an element.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -615,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
self._start_registered_elements(tag, attributes)
|
self._start_registered_elements(tag, attributes)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def skippedEntity(self, name): # noqa: N802
|
def skippedEntity(self, name):
|
||||||
"""Receive notification of a skipped entity.
|
"""Receive notification of a skipped entity.
|
||||||
|
|
||||||
HTML entities will be skipped by the parser. This method will unescape them
|
HTML entities will be skipped by the parser. This method will unescape them
|
||||||
@ -649,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
self.text += unescaped
|
self.text += unescaped
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def endElement(self, tag): # noqa: N802
|
def endElement(self, tag):
|
||||||
"""Signal the end of an element.
|
"""Signal the end of an element.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -690,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
if tag in [member.value for member in self.Element]:
|
if tag in [member.value for member in self.Element]:
|
||||||
if (
|
if (
|
||||||
tag == self.Element.HEADING.value
|
tag == self.Element.HEADING.value
|
||||||
and not self.Element.SDOCL.value in self.property
|
and self.Element.SDOCL.value not in self.property
|
||||||
):
|
):
|
||||||
level_attr: str = attributes.get("LVL", "")
|
level_attr: str = attributes.get("LVL", "")
|
||||||
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
|
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
|
||||||
@ -742,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|||||||
# headers except claims statement
|
# headers except claims statement
|
||||||
elif (
|
elif (
|
||||||
self.Element.HEADING.value in self.property
|
self.Element.HEADING.value in self.property
|
||||||
and not self.Element.SDOCL.value in self.property
|
and self.Element.SDOCL.value not in self.property
|
||||||
and text.strip()
|
and text.strip()
|
||||||
):
|
):
|
||||||
self.parents[self.level + 1] = self.doc.add_heading(
|
self.parents[self.level + 1] = self.doc.add_heading(
|
||||||
@ -1163,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|||||||
self.style_html = HtmlEntity()
|
self.style_html = HtmlEntity()
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def startElement(self, tag, attributes): # noqa: N802
|
def startElement(self, tag, attributes):
|
||||||
"""Signal the start of an element.
|
"""Signal the start of an element.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -1176,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|||||||
self._start_registered_elements(tag, attributes)
|
self._start_registered_elements(tag, attributes)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def skippedEntity(self, name): # noqa: N802
|
def skippedEntity(self, name):
|
||||||
"""Receive notification of a skipped entity.
|
"""Receive notification of a skipped entity.
|
||||||
|
|
||||||
HTML entities will be skipped by the parser. This method will unescape them
|
HTML entities will be skipped by the parser. This method will unescape them
|
||||||
@ -1210,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|||||||
self.text += unescaped
|
self.text += unescaped
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def endElement(self, tag): # noqa: N802
|
def endElement(self, tag):
|
||||||
"""Signal the end of an element.
|
"""Signal the end of an element.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -1526,7 +1525,7 @@ class XmlTable:
|
|||||||
|
|
||||||
return ncols_max
|
return ncols_max
|
||||||
|
|
||||||
def _parse_table(self, table: Tag) -> TableData:
|
def _parse_table(self, table: Tag) -> TableData: # noqa: C901
|
||||||
"""Parse the content of a table tag.
|
"""Parse the content of a table tag.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -1721,7 +1720,7 @@ class HtmlEntity:
|
|||||||
"0": "⁰",
|
"0": "⁰",
|
||||||
"+": "⁺",
|
"+": "⁺",
|
||||||
"-": "⁻",
|
"-": "⁻",
|
||||||
"−": "⁻",
|
"−": "⁻", # noqa: RUF001
|
||||||
"=": "⁼",
|
"=": "⁼",
|
||||||
"(": "⁽",
|
"(": "⁽",
|
||||||
")": "⁾",
|
")": "⁾",
|
||||||
@ -1745,7 +1744,7 @@ class HtmlEntity:
|
|||||||
"0": "₀",
|
"0": "₀",
|
||||||
"+": "₊",
|
"+": "₊",
|
||||||
"-": "₋",
|
"-": "₋",
|
||||||
"−": "₋",
|
"−": "₋", # noqa: RUF001
|
||||||
"=": "₌",
|
"=": "₌",
|
||||||
"(": "₍",
|
"(": "₍",
|
||||||
")": "₎",
|
")": "₎",
|
||||||
|
@ -6,8 +6,9 @@ import sys
|
|||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
from typing import Annotated, Dict, List, Optional, Type
|
||||||
|
|
||||||
import rich.table
|
import rich.table
|
||||||
import typer
|
import typer
|
||||||
@ -288,7 +289,7 @@ def convert(
|
|||||||
...,
|
...,
|
||||||
help=(
|
help=(
|
||||||
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
|
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
|
||||||
f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
|
f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
|
||||||
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
|
@ -62,7 +62,7 @@ def download(
|
|||||||
models: Annotated[
|
models: Annotated[
|
||||||
Optional[list[_AvailableModels]],
|
Optional[list[_AvailableModels]],
|
||||||
typer.Argument(
|
typer.Argument(
|
||||||
help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
|
help="Models to download (default behavior: a predefined set of models will be downloaded).",
|
||||||
),
|
),
|
||||||
] = None,
|
] = None,
|
||||||
all: Annotated[
|
all: Annotated[
|
||||||
|
@ -10,7 +10,7 @@ from docling_core.types.doc import (
|
|||||||
TableCell,
|
TableCell,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||||
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
|
from docling_core.types.io import (
|
||||||
DocumentStream,
|
DocumentStream,
|
||||||
)
|
)
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
@ -243,7 +243,7 @@ class Page(BaseModel):
|
|||||||
if self._backend is None:
|
if self._backend is None:
|
||||||
return self._image_cache.get(scale, None)
|
return self._image_cache.get(scale, None)
|
||||||
|
|
||||||
if not scale in self._image_cache:
|
if scale not in self._image_cache:
|
||||||
if cropbox is None:
|
if cropbox is None:
|
||||||
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
||||||
else:
|
else:
|
||||||
|
@ -1,13 +1,13 @@
|
|||||||
import csv
|
import csv
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from collections.abc import Iterable
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
Dict,
|
Dict,
|
||||||
Iterable,
|
|
||||||
List,
|
List,
|
||||||
Literal,
|
Literal,
|
||||||
Optional,
|
Optional,
|
||||||
@ -18,31 +18,9 @@ from typing import (
|
|||||||
|
|
||||||
import filetype
|
import filetype
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItem,
|
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
PictureItem,
|
|
||||||
SectionHeaderItem,
|
|
||||||
TableItem,
|
|
||||||
TextItem,
|
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.document import ListItem
|
|
||||||
from docling_core.types.legacy_doc.base import (
|
|
||||||
BaseText,
|
|
||||||
Figure,
|
|
||||||
GlmTableCell,
|
|
||||||
PageDimensions,
|
|
||||||
PageReference,
|
|
||||||
Prov,
|
|
||||||
Ref,
|
|
||||||
)
|
|
||||||
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
|
||||||
from docling_core.types.legacy_doc.base import TableCell
|
|
||||||
from docling_core.types.legacy_doc.document import (
|
|
||||||
CCSDocumentDescription as DsDocumentDescription,
|
|
||||||
)
|
|
||||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
|
||||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
|
||||||
from docling_core.utils.file import resolve_source_to_stream
|
from docling_core.utils.file import resolve_source_to_stream
|
||||||
from docling_core.utils.legacy import docling_document_to_legacy
|
from docling_core.utils.legacy import docling_document_to_legacy
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
@ -65,7 +43,7 @@ from docling.datamodel.base_models import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.settings import DocumentLimits
|
from docling.datamodel.settings import DocumentLimits
|
||||||
from docling.utils.profiling import ProfilingItem
|
from docling.utils.profiling import ProfilingItem
|
||||||
from docling.utils.utils import create_file_hash, create_hash
|
from docling.utils.utils import create_file_hash
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.document_converter import FormatOption
|
from docling.document_converter import FormatOption
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import math
|
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
from collections.abc import Iterable, Iterator
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
|
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||||
|
|
||||||
@ -254,7 +254,7 @@ class DocumentConverter:
|
|||||||
|
|
||||||
if not had_result and raises_on_error:
|
if not had_result and raises_on_error:
|
||||||
raise ConversionError(
|
raise ConversionError(
|
||||||
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||||
)
|
)
|
||||||
|
|
||||||
def _convert(
|
def _convert(
|
||||||
@ -266,7 +266,7 @@ class DocumentConverter:
|
|||||||
conv_input.docs(self.format_to_options),
|
conv_input.docs(self.format_to_options),
|
||||||
settings.perf.doc_batch_size, # pass format_options
|
settings.perf.doc_batch_size, # pass format_options
|
||||||
):
|
):
|
||||||
_log.info(f"Going to convert document batch...")
|
_log.info("Going to convert document batch...")
|
||||||
|
|
||||||
# parallel processing only within input_batch
|
# parallel processing only within input_batch
|
||||||
# with ThreadPoolExecutor(
|
# with ThreadPoolExecutor(
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from typing import Iterable
|
from collections.abc import Iterable
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, VlmPrediction
|
from docling.datamodel.base_models import Page, VlmPrediction
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Generic, Iterable, Optional, Protocol, Type
|
from collections.abc import Iterable
|
||||||
|
from typing import Generic, Optional, Protocol, Type
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
||||||
from typing_extensions import TypeVar
|
from typing_extensions import TypeVar
|
||||||
|
@ -1,12 +1,12 @@
|
|||||||
import copy
|
import copy
|
||||||
import logging
|
import logging
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Optional, Type
|
from typing import List, Optional, Type
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
|
|
||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from rtree import index
|
from rtree import index
|
||||||
from scipy.ndimage import binary_dilation, find_objects, label
|
from scipy.ndimage import binary_dilation, find_objects, label
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
import re
|
import re
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
from typing import List, Literal, Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
from typing import List, Literal, Optional, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
import logging
|
import logging
|
||||||
import warnings
|
import warnings
|
||||||
import zipfile
|
import zipfile
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Optional, Type
|
from typing import List, Optional, Type
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
@ -98,8 +99,10 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
progress: bool = False,
|
progress: bool = False,
|
||||||
) -> Path:
|
) -> Path:
|
||||||
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
|
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
|
||||||
from easyocr.config import detection_models as det_models_dict
|
from easyocr.config import (
|
||||||
from easyocr.config import recognition_models as rec_models_dict
|
detection_models as det_models_dict,
|
||||||
|
recognition_models as rec_models_dict,
|
||||||
|
)
|
||||||
|
|
||||||
if local_dir is None:
|
if local_dir is None:
|
||||||
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
|
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
|
||||||
|
@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache
|
||||||
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
|
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
|
||||||
factory = OcrFactory()
|
factory = OcrFactory()
|
||||||
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
|
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
|
||||||
@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
|
|||||||
return factory
|
return factory
|
||||||
|
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache
|
||||||
def get_picture_description_factory(
|
def get_picture_description_factory(
|
||||||
allow_external_plugins: bool = False,
|
allow_external_plugins: bool = False,
|
||||||
) -> PictureDescriptionFactory:
|
) -> PictureDescriptionFactory:
|
||||||
|
@ -1,18 +1,16 @@
|
|||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Optional
|
from typing import Optional
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, VlmPrediction
|
from docling.datamodel.base_models import Page, VlmPrediction
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
HuggingFaceVlmOptions,
|
HuggingFaceVlmOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.utils.accelerator_utils import decide_device
|
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
@ -1,16 +1,15 @@
|
|||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Optional
|
from typing import Optional
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, VlmPrediction
|
from docling.datamodel.base_models import Page, VlmPrediction
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
HuggingFaceVlmOptions,
|
HuggingFaceVlmOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docling.utils.accelerator_utils import decide_device
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
@ -41,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
|
|||||||
device = decide_device(accelerator_options.device)
|
device = decide_device(accelerator_options.device)
|
||||||
self.device = device
|
self.device = device
|
||||||
|
|
||||||
_log.debug("Available device for HuggingFace VLM: {}".format(device))
|
_log.debug(f"Available device for HuggingFace VLM: {device}")
|
||||||
|
|
||||||
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
import copy
|
import copy
|
||||||
import logging
|
import logging
|
||||||
import warnings
|
import warnings
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Union
|
from typing import Optional
|
||||||
|
|
||||||
from docling_core.types.doc import DocItemLabel
|
from docling_core.types.doc import DocItemLabel
|
||||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Tuple, Type
|
from typing import Optional, Type
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
|
|||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
if "darwin" != sys.platform:
|
if "darwin" != sys.platform:
|
||||||
raise RuntimeError(f"OcrMac is only supported on Mac.")
|
raise RuntimeError("OcrMac is only supported on Mac.")
|
||||||
install_errmsg = (
|
install_errmsg = (
|
||||||
"ocrmac is not correctly installed. "
|
"ocrmac is not correctly installed. "
|
||||||
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import Iterable, List
|
from collections.abc import Iterable
|
||||||
|
from typing import List
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
|
|||||||
sanitized_text = "".join(lines)
|
sanitized_text = "".join(lines)
|
||||||
|
|
||||||
# Text normalization
|
# Text normalization
|
||||||
sanitized_text = sanitized_text.replace("⁄", "/")
|
sanitized_text = sanitized_text.replace("⁄", "/") # noqa: RUF001
|
||||||
sanitized_text = sanitized_text.replace("’", "'")
|
sanitized_text = sanitized_text.replace("’", "'") # noqa: RUF001
|
||||||
sanitized_text = sanitized_text.replace("‘", "'")
|
sanitized_text = sanitized_text.replace("‘", "'") # noqa: RUF001
|
||||||
sanitized_text = sanitized_text.replace("“", '"')
|
sanitized_text = sanitized_text.replace("“", '"')
|
||||||
sanitized_text = sanitized_text.replace("”", '"')
|
sanitized_text = sanitized_text.replace("”", '"')
|
||||||
sanitized_text = sanitized_text.replace("•", "·")
|
sanitized_text = sanitized_text.replace("•", "·")
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional
|
from typing import Optional
|
||||||
|
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Type, Union
|
from typing import Optional, Type, Union
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
import logging
|
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Iterable, List, Optional, Type, Union
|
from typing import List, Optional, Type, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
NodeItem,
|
NodeItem,
|
||||||
PictureClassificationClass,
|
|
||||||
PictureItem,
|
PictureItem,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
|
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Type, Union
|
from typing import Optional, Type, Union
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Type
|
from typing import Optional, Type
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
@ -1,12 +1,7 @@
|
|||||||
import copy
|
|
||||||
import random
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
BoundingBox,
|
|
||||||
CoordOrigin,
|
|
||||||
DocItem,
|
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
@ -17,13 +12,10 @@ from docling_core.types.doc import (
|
|||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.document import ContentLayer
|
from docling_core.types.doc.document import ContentLayer
|
||||||
from docling_core.types.legacy_doc.base import Ref
|
|
||||||
from docling_core.types.legacy_doc.document import BaseText
|
|
||||||
from docling_ibm_models.reading_order.reading_order_rb import (
|
from docling_ibm_models.reading_order.reading_order_rb import (
|
||||||
PageElement as ReadingOrderPageElement,
|
PageElement as ReadingOrderPageElement,
|
||||||
|
ReadingOrderPredictor,
|
||||||
)
|
)
|
||||||
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
|
|
||||||
from PIL import ImageDraw
|
|
||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
|
|||||||
TextElement,
|
TextElement,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,13 +1,13 @@
|
|||||||
import copy
|
import copy
|
||||||
import warnings
|
import warnings
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Union
|
from typing import Optional
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||||
from docling_core.types.doc.page import (
|
from docling_core.types.doc.page import (
|
||||||
BoundingRectangle,
|
BoundingRectangle,
|
||||||
SegmentedPdfPage,
|
|
||||||
TextCellUnit,
|
TextCellUnit,
|
||||||
)
|
)
|
||||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||||
|
@ -3,9 +3,10 @@ import io
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from subprocess import DEVNULL, PIPE, Popen
|
from subprocess import DEVNULL, PIPE, Popen
|
||||||
from typing import Iterable, List, Optional, Tuple, Type
|
from typing import List, Optional, Tuple, Type
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Type
|
from typing import Optional, Type
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
@ -3,9 +3,10 @@ import logging
|
|||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Callable, Iterable, List
|
from collections.abc import Iterable
|
||||||
|
from typing import Any, Callable, List
|
||||||
|
|
||||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
from docling_core.types.doc import NodeItem
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|||||||
|
|
||||||
total_elapsed_time = 0.0
|
total_elapsed_time = 0.0
|
||||||
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||||
for i in range(0, conv_res.input.page_count):
|
for i in range(conv_res.input.page_count):
|
||||||
start_page, end_page = conv_res.input.limits.page_range
|
start_page, end_page = conv_res.input.limits.page_range
|
||||||
if (start_page - 1) <= i <= (end_page - 1):
|
if (start_page - 1) <= i <= (end_page - 1):
|
||||||
conv_res.pages.append(Page(page_no=i))
|
conv_res.pages.append(Page(page_no=i))
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import logging
|
import logging
|
||||||
import sys
|
|
||||||
import warnings
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, cast
|
from typing import Optional, cast
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import logging
|
import logging
|
||||||
import warnings
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Union, cast
|
from typing import List, Optional, Union, cast
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
import logging
|
import logging
|
||||||
from typing import Any, Dict, Iterable, List, Tuple, Union
|
from collections.abc import Iterable
|
||||||
|
from typing import Any, Dict, List, Tuple, Union
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import TextCell
|
|
||||||
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
||||||
|
|
||||||
from docling.datamodel.document import ConversionResult, Page
|
from docling.datamodel.document import ConversionResult, Page
|
||||||
|
@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
|
|||||||
return unique_objects
|
return unique_objects
|
||||||
|
|
||||||
|
|
||||||
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
mimetype="application/pdf",
|
mimetype="application/pdf",
|
||||||
filename=doc_glm["file-info"]["filename"],
|
filename=doc_glm["file-info"]["filename"],
|
||||||
|
@ -18,7 +18,7 @@ class UnionFind:
|
|||||||
|
|
||||||
def __init__(self, elements):
|
def __init__(self, elements):
|
||||||
self.parent = {elem: elem for elem in elements}
|
self.parent = {elem: elem for elem in elements}
|
||||||
self.rank = {elem: 0 for elem in elements}
|
self.rank = dict.fromkeys(elements, 0)
|
||||||
|
|
||||||
def find(self, x):
|
def find(self, x):
|
||||||
if self.parent[x] != x:
|
if self.parent[x] != x:
|
||||||
|
@ -37,7 +37,7 @@ def download_models(
|
|||||||
output_dir.mkdir(exist_ok=True, parents=True)
|
output_dir.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
if with_layout:
|
if with_layout:
|
||||||
_log.info(f"Downloading layout model...")
|
_log.info("Downloading layout model...")
|
||||||
LayoutModel.download_models(
|
LayoutModel.download_models(
|
||||||
local_dir=output_dir / LayoutModel._model_repo_folder,
|
local_dir=output_dir / LayoutModel._model_repo_folder,
|
||||||
force=force,
|
force=force,
|
||||||
@ -45,7 +45,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_tableformer:
|
if with_tableformer:
|
||||||
_log.info(f"Downloading tableformer model...")
|
_log.info("Downloading tableformer model...")
|
||||||
TableStructureModel.download_models(
|
TableStructureModel.download_models(
|
||||||
local_dir=output_dir / TableStructureModel._model_repo_folder,
|
local_dir=output_dir / TableStructureModel._model_repo_folder,
|
||||||
force=force,
|
force=force,
|
||||||
@ -53,7 +53,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_picture_classifier:
|
if with_picture_classifier:
|
||||||
_log.info(f"Downloading picture classifier model...")
|
_log.info("Downloading picture classifier model...")
|
||||||
DocumentPictureClassifier.download_models(
|
DocumentPictureClassifier.download_models(
|
||||||
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
|
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
|
||||||
force=force,
|
force=force,
|
||||||
@ -61,7 +61,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_code_formula:
|
if with_code_formula:
|
||||||
_log.info(f"Downloading code formula model...")
|
_log.info("Downloading code formula model...")
|
||||||
CodeFormulaModel.download_models(
|
CodeFormulaModel.download_models(
|
||||||
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
||||||
force=force,
|
force=force,
|
||||||
@ -69,7 +69,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_smolvlm:
|
if with_smolvlm:
|
||||||
_log.info(f"Downloading SmolVlm model...")
|
_log.info("Downloading SmolVlm model...")
|
||||||
PictureDescriptionVlmModel.download_models(
|
PictureDescriptionVlmModel.download_models(
|
||||||
repo_id=smolvlm_picture_description.repo_id,
|
repo_id=smolvlm_picture_description.repo_id,
|
||||||
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
||||||
@ -78,7 +78,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_granite_vision:
|
if with_granite_vision:
|
||||||
_log.info(f"Downloading Granite Vision model...")
|
_log.info("Downloading Granite Vision model...")
|
||||||
PictureDescriptionVlmModel.download_models(
|
PictureDescriptionVlmModel.download_models(
|
||||||
repo_id=granite_picture_description.repo_id,
|
repo_id=granite_picture_description.repo_id,
|
||||||
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
||||||
@ -87,7 +87,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_easyocr:
|
if with_easyocr:
|
||||||
_log.info(f"Downloading easyocr models...")
|
_log.info("Downloading easyocr models...")
|
||||||
EasyOcrModel.download_models(
|
EasyOcrModel.download_models(
|
||||||
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
||||||
force=force,
|
force=force,
|
||||||
|
@ -383,7 +383,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"print(f\"Downloading {url}...\")\n",
|
"print(f\"Downloading {url}...\")\n",
|
||||||
"buf = BytesIO(requests.get(url).content)\n",
|
"buf = BytesIO(requests.get(url).content)\n",
|
||||||
"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
|
"print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
|
||||||
"with zipfile.ZipFile(buf) as zf:\n",
|
"with zipfile.ZipFile(buf) as zf:\n",
|
||||||
" res = zf.testzip()\n",
|
" res = zf.testzip()\n",
|
||||||
" if res:\n",
|
" if res:\n",
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable
|
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
from docling_core.types.doc import ImageRefMode
|
from docling_core.types.doc import ImageRefMode
|
||||||
@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
|
|||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
@ -3,7 +3,6 @@ import logging
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
|
|||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.models.ocr_mac_model import OcrMacOptions
|
|
||||||
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
|
||||||
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -3,8 +3,8 @@
|
|||||||
# It does not run the actual formula understanding model.
|
# It does not run the actual formula understanding model.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable
|
|
||||||
|
|
||||||
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
|
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
|
||||||
|
|
||||||
|
@ -3,8 +3,9 @@
|
|||||||
# It does not run the actual picture classifier model.
|
# It does not run the actual picture classifier model.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Iterable
|
from typing import Any
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
|
@ -4,7 +4,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
|
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
|
||||||
|
|
||||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
@ -1,14 +1,9 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
EasyOcrOptions,
|
|
||||||
OcrMacOptions,
|
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
RapidOcrOptions,
|
|
||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
@ -153,10 +153,10 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"for i, chunk in enumerate(chunk_iter):\n",
|
"for i, chunk in enumerate(chunk_iter):\n",
|
||||||
" print(f\"=== {i} ===\")\n",
|
" print(f\"=== {i} ===\")\n",
|
||||||
" print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
|
" print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" enriched_text = chunker.serialize(chunk=chunk)\n",
|
" enriched_text = chunker.serialize(chunk=chunk)\n",
|
||||||
" print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
|
" print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" print()"
|
" print()"
|
||||||
]
|
]
|
||||||
@ -353,11 +353,11 @@
|
|||||||
"for i, chunk in enumerate(chunks):\n",
|
"for i, chunk in enumerate(chunks):\n",
|
||||||
" print(f\"=== {i} ===\")\n",
|
" print(f\"=== {i} ===\")\n",
|
||||||
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
|
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
|
||||||
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
|
" print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" ser_txt = chunker.serialize(chunk=chunk)\n",
|
" ser_txt = chunker.serialize(chunk=chunk)\n",
|
||||||
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
|
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
|
||||||
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
|
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" print()"
|
" print()"
|
||||||
]
|
]
|
||||||
|
@ -2,17 +2,11 @@ import json
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
granite_vision_vlm_conversion_options,
|
|
||||||
smoldocling_vlm_conversion_options,
|
|
||||||
smoldocling_vlm_mlx_conversion_options,
|
smoldocling_vlm_mlx_conversion_options,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
|
|||||||
for source in sources:
|
for source in sources:
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
print("================================================")
|
print("================================================")
|
||||||
print("Processing... {}".format(source))
|
print(f"Processing... {source}")
|
||||||
print("================================================")
|
print("================================================")
|
||||||
print("")
|
print("")
|
||||||
|
|
||||||
@ -77,7 +71,7 @@ for source in sources:
|
|||||||
print(page.predictions.vlm_response.text)
|
print(page.predictions.vlm_response.text)
|
||||||
|
|
||||||
res.document.save_as_html(
|
res.document.save_as_html(
|
||||||
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
|
filename=Path(f"{out_path}/{res.input.file.stem}.html"),
|
||||||
image_mode=ImageRefMode.REFERENCED,
|
image_mode=ImageRefMode.REFERENCED,
|
||||||
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
|
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
|
||||||
)
|
)
|
||||||
|
@ -144,7 +144,7 @@
|
|||||||
"for pic in doc.pictures[:5]:\n",
|
"for pic in doc.pictures[:5]:\n",
|
||||||
" html_item = (\n",
|
" html_item = (\n",
|
||||||
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
|
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
|
||||||
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
|
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
|
||||||
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
|
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" for annotation in pic.annotations:\n",
|
" for annotation in pic.annotations:\n",
|
||||||
@ -252,7 +252,7 @@
|
|||||||
"for pic in doc.pictures[:5]:\n",
|
"for pic in doc.pictures[:5]:\n",
|
||||||
" html_item = (\n",
|
" html_item = (\n",
|
||||||
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
|
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
|
||||||
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
|
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
|
||||||
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
|
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" for annotation in pic.annotations:\n",
|
" for annotation in pic.annotations:\n",
|
||||||
|
@ -351,7 +351,7 @@
|
|||||||
"for source in sources:\n",
|
"for source in sources:\n",
|
||||||
" if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
|
" if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
|
||||||
" doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
|
" doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
|
||||||
" print(f\"- text: {repr(doc_chunk.text)}\")\n",
|
" print(f\"- text: {doc_chunk.text!r}\")\n",
|
||||||
" if doc_chunk.meta.origin:\n",
|
" if doc_chunk.meta.origin:\n",
|
||||||
" print(f\" file: {doc_chunk.meta.origin.filename}\")\n",
|
" print(f\" file: {doc_chunk.meta.origin.filename}\")\n",
|
||||||
" if doc_chunk.meta.headings:\n",
|
" if doc_chunk.meta.headings:\n",
|
||||||
|
@ -119,7 +119,7 @@
|
|||||||
" device = torch.device(\"mps\")\n",
|
" device = torch.device(\"mps\")\n",
|
||||||
" print(\"MPS GPU is enabled.\")\n",
|
" print(\"MPS GPU is enabled.\")\n",
|
||||||
"else:\n",
|
"else:\n",
|
||||||
" raise EnvironmentError(\n",
|
" raise OSError(\n",
|
||||||
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
|
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
|
||||||
" )"
|
" )"
|
||||||
]
|
]
|
||||||
@ -226,7 +226,6 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from docling.datamodel.document import ConversionResult\n",
|
|
||||||
"from docling.document_converter import DocumentConverter\n",
|
"from docling.document_converter import DocumentConverter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Instantiate the doc converter\n",
|
"# Instantiate the doc converter\n",
|
||||||
@ -345,7 +344,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" openai_api_key = os.getenv(openai_api_key_var)\n",
|
" openai_api_key = os.getenv(openai_api_key_var)\n",
|
||||||
" if not openai_api_key:\n",
|
" if not openai_api_key:\n",
|
||||||
" raise EnvironmentError(\n",
|
" raise OSError(\n",
|
||||||
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
|
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
|
||||||
" \"Please define it before running this script.\"\n",
|
" \"Please define it before running this script.\"\n",
|
||||||
" )"
|
" )"
|
||||||
@ -387,7 +386,6 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import weaviate.classes.config as wc\n",
|
"import weaviate.classes.config as wc\n",
|
||||||
"from weaviate.classes.config import DataType, Property\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Define the collection name\n",
|
"# Define the collection name\n",
|
||||||
"collection_name = \"docling\"\n",
|
"collection_name = \"docling\"\n",
|
||||||
|
@ -25,7 +25,7 @@ def main():
|
|||||||
document = mdb.convert()
|
document = mdb.convert()
|
||||||
|
|
||||||
out_path = Path("scratch")
|
out_path = Path("scratch")
|
||||||
print(f"Document {path} converted.\nSaved markdown output to: {str(out_path)}")
|
print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")
|
||||||
|
|
||||||
# Export Docling document format to markdowndoc:
|
# Export Docling document format to markdowndoc:
|
||||||
fn = os.path.basename(path)
|
fn = os.path.basename(path)
|
||||||
|
@ -1,13 +1,10 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
TesseractCliOcrOptions,
|
|
||||||
TesseractOcrOptions,
|
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
@ -63,7 +63,7 @@ def main():
|
|||||||
out_path = Path("scratch")
|
out_path = Path("scratch")
|
||||||
print(
|
print(
|
||||||
f"Document {res.input.file.name} converted."
|
f"Document {res.input.file.name} converted."
|
||||||
f"\nSaved markdown output to: {str(out_path)}"
|
f"\nSaved markdown output to: {out_path!s}"
|
||||||
)
|
)
|
||||||
_log.debug(res.document._export_to_indented_text(max_text_len=16))
|
_log.debug(res.document._export_to_indented_text(max_text_len=16))
|
||||||
# Export Docling document format to markdowndoc:
|
# Export Docling document format to markdowndoc:
|
||||||
|
@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
|
|||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
@ -2,9 +2,9 @@ import logging
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
|
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
|
||||||
|
|
||||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
|
|||||||
ApiVlmOptions,
|
ApiVlmOptions,
|
||||||
ResponseFormat,
|
ResponseFormat,
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
granite_vision_vlm_ollama_conversion_options,
|
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
@ -202,12 +202,16 @@ select = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
ignore = [
|
ignore = [
|
||||||
|
"C408", # Unnecessary `dict()` call (rewrite as a literal)
|
||||||
"E501", # Line too long, handled by ruff formatter
|
"E501", # Line too long, handled by ruff formatter
|
||||||
"D107", # "Missing docstring in __init__",
|
"D107", # "Missing docstring in __init__",
|
||||||
|
"F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
|
||||||
"F811", # "redefinition of the same function"
|
"F811", # "redefinition of the same function"
|
||||||
"PL", # Pylint
|
"PL", # Pylint
|
||||||
"RUF012", # Mutable Class Attributes
|
"RUF012", # Mutable Class Attributes
|
||||||
|
"UP006", # List vs list, etc
|
||||||
"UP007", # Option and Union
|
"UP007", # Option and Union
|
||||||
|
"UP035", # `typing.Set` is deprecated, use `set` instead"
|
||||||
]
|
]
|
||||||
|
|
||||||
#extend-select = []
|
#extend-select = []
|
||||||
@ -217,7 +221,7 @@ ignore = [
|
|||||||
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
|
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
|
||||||
|
|
||||||
[tool.ruff.lint.mccabe]
|
[tool.ruff.lint.mccabe]
|
||||||
max-complexity = 15
|
max-complexity = 20
|
||||||
|
|
||||||
# [tool.ruff.lint.isort.sections]
|
# [tool.ruff.lint.isort.sections]
|
||||||
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
|
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
|
||||||
|
@ -37,7 +37,7 @@ def test_asciidocs_examples():
|
|||||||
print("\n\n", pred_mddoc)
|
print("\n\n", pred_mddoc)
|
||||||
|
|
||||||
if os.path.exists(gname):
|
if os.path.exists(gname):
|
||||||
with open(gname, "r") as fr:
|
with open(gname) as fr:
|
||||||
true_mddoc = fr.read()
|
true_mddoc = fr.read()
|
||||||
|
|
||||||
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
import json
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from pytest import warns
|
from pytest import warns
|
||||||
@ -16,7 +14,7 @@ GENERATE = GEN_TEST_DATA
|
|||||||
|
|
||||||
def get_csv_paths():
|
def get_csv_paths():
|
||||||
# Define the directory you want to search
|
# Define the directory you want to search
|
||||||
directory = Path(f"./tests/data/csv/")
|
directory = Path("./tests/data/csv/")
|
||||||
|
|
||||||
# List all CSV files in the directory and its subdirectories
|
# List all CSV files in the directory and its subdirectories
|
||||||
return sorted(directory.rglob("*.csv"))
|
return sorted(directory.rglob("*.csv"))
|
||||||
|
@ -32,7 +32,7 @@ def test_text_cell_counts():
|
|||||||
|
|
||||||
doc_backend = _get_backend(pdf_doc)
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
for page_index in range(0, doc_backend.page_count()):
|
for page_index in range(doc_backend.page_count()):
|
||||||
last_cell_count = None
|
last_cell_count = None
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
||||||
|
@ -31,7 +31,7 @@ def test_text_cell_counts():
|
|||||||
|
|
||||||
doc_backend = _get_backend(pdf_doc)
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
for page_index in range(0, doc_backend.page_count()):
|
for page_index in range(doc_backend.page_count()):
|
||||||
last_cell_count = None
|
last_cell_count = None
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
|
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
|
||||||
|
@ -31,7 +31,7 @@ def test_text_cell_counts():
|
|||||||
|
|
||||||
doc_backend = _get_backend(pdf_doc)
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
for page_index in range(0, doc_backend.page_count()):
|
for page_index in range(doc_backend.page_count()):
|
||||||
last_cell_count = None
|
last_cell_count = None
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
|
page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
|
||||||
|
@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA
|
|||||||
|
|
||||||
|
|
||||||
def get_pubmed_paths():
|
def get_pubmed_paths():
|
||||||
directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
|
directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
|
||||||
xml_files = sorted(directory.rglob("*.xml"))
|
xml_files = sorted(directory.rglob("*.xml"))
|
||||||
return xml_files
|
return xml_files
|
||||||
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
import os
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
|
@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
|
|||||||
assert isinstance(texts[2], TextItem)
|
assert isinstance(texts[2], TextItem)
|
||||||
assert texts[2].text == (
|
assert texts[2].text == (
|
||||||
"An interleaver receives incoming data frames of size N. The interleaver "
|
"An interleaver receives incoming data frames of size N. The interleaver "
|
||||||
"indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
|
"indexes the elements of the frame with an N₁×N₂ index array. The interleaver " # noqa: RUF001
|
||||||
"then effectively rearranges (permutes) the data by permuting the rows of the "
|
"then effectively rearranges (permutes) the data by permuting the rows of the "
|
||||||
"index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
|
"index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to " # noqa: RUF001
|
||||||
"permute the columns (indexed by k) of each row (indexed by j). P is at least "
|
"permute the columns (indexed by k) of each row (indexed by j). P is at least "
|
||||||
"equal to N₂, βj is a constant which may be different for each row, and each "
|
"equal to N₂, βj is a constant which may be different for each row, and each "
|
||||||
"αj is a relative prime number relative to P. After permuting, the "
|
"αj is a relative prime number relative to P. After permuting, the " # noqa: RUF001
|
||||||
"interleaver outputs the data in a different order than received (e.g., "
|
"interleaver outputs the data in a different order than received (e.g., "
|
||||||
"receives sequentially row by row, outputs sequentially each column by column)."
|
"receives sequentially row by row, outputs sequentially each column by column)."
|
||||||
)
|
)
|
||||||
|
@ -32,7 +32,7 @@ def test_text_cell_counts():
|
|||||||
|
|
||||||
doc_backend = _get_backend(pdf_doc)
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
for page_index in range(0, doc_backend.page_count()):
|
for page_index in range(doc_backend.page_count()):
|
||||||
last_cell_count = None
|
last_cell_count = None
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
import os
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
@ -3,7 +3,6 @@ from pathlib import Path
|
|||||||
from docling_core.types.doc import CodeItem, TextItem
|
from docling_core.types.doc import CodeItem, TextItem
|
||||||
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
|
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
@ -2,7 +2,6 @@ from pathlib import Path
|
|||||||
|
|
||||||
from docling_core.types.doc import PictureClassificationData
|
from docling_core.types.doc import PictureClassificationData
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
|
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
|
||||||
|
@ -3,7 +3,6 @@ from pathlib import Path
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
|
@ -4,7 +4,6 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
@ -3,8 +3,6 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
@ -216,7 +216,7 @@ def verify_picture_image_v2(
|
|||||||
|
|
||||||
|
|
||||||
def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
|
def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
|
||||||
assert len(doc_pred.texts) == len(doc_true.texts), f"Text lengths do not match."
|
assert len(doc_pred.texts) == len(doc_true.texts), "Text lengths do not match."
|
||||||
|
|
||||||
assert len(doc_true.tables) == len(doc_pred.tables), (
|
assert len(doc_true.tables) == len(doc_pred.tables), (
|
||||||
"document has different count of tables than expected."
|
"document has different count of tables than expected."
|
||||||
@ -230,7 +230,7 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
|
|||||||
assert isinstance(pred_item, DocItem), "Test item is not a DocItem"
|
assert isinstance(pred_item, DocItem), "Test item is not a DocItem"
|
||||||
|
|
||||||
# Validate type
|
# Validate type
|
||||||
assert true_item.label == pred_item.label, f"Object label does not match."
|
assert true_item.label == pred_item.label, "Object label does not match."
|
||||||
|
|
||||||
# Validate provenance
|
# Validate provenance
|
||||||
assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch"
|
assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch"
|
||||||
@ -337,16 +337,16 @@ def verify_conversion_result_v1(
|
|||||||
with open(dt_path, "w") as fw:
|
with open(dt_path, "w") as fw:
|
||||||
fw.write(doc_pred_dt)
|
fw.write(doc_pred_dt)
|
||||||
else: # default branch in test
|
else: # default branch in test
|
||||||
with open(pages_path, "r") as fr:
|
with open(pages_path) as fr:
|
||||||
doc_true_pages = PageList.validate_json(fr.read())
|
doc_true_pages = PageList.validate_json(fr.read())
|
||||||
|
|
||||||
with open(json_path, "r") as fr:
|
with open(json_path) as fr:
|
||||||
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
|
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
|
||||||
|
|
||||||
with open(md_path, "r") as fr:
|
with open(md_path) as fr:
|
||||||
doc_true_md = fr.read()
|
doc_true_md = fr.read()
|
||||||
|
|
||||||
with open(dt_path, "r") as fr:
|
with open(dt_path) as fr:
|
||||||
doc_true_dt = fr.read()
|
doc_true_dt = fr.read()
|
||||||
|
|
||||||
if not fuzzy:
|
if not fuzzy:
|
||||||
@ -419,16 +419,16 @@ def verify_conversion_result_v2(
|
|||||||
with open(dt_path, "w") as fw:
|
with open(dt_path, "w") as fw:
|
||||||
fw.write(doc_pred_dt)
|
fw.write(doc_pred_dt)
|
||||||
else: # default branch in test
|
else: # default branch in test
|
||||||
with open(pages_path, "r") as fr:
|
with open(pages_path) as fr:
|
||||||
doc_true_pages = PageList.validate_json(fr.read())
|
doc_true_pages = PageList.validate_json(fr.read())
|
||||||
|
|
||||||
with open(json_path, "r") as fr:
|
with open(json_path) as fr:
|
||||||
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
|
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
|
||||||
|
|
||||||
with open(md_path, "r") as fr:
|
with open(md_path) as fr:
|
||||||
doc_true_md = fr.read()
|
doc_true_md = fr.read()
|
||||||
|
|
||||||
with open(dt_path, "r") as fr:
|
with open(dt_path) as fr:
|
||||||
doc_true_dt = fr.read()
|
doc_true_dt = fr.read()
|
||||||
|
|
||||||
if not fuzzy:
|
if not fuzzy:
|
||||||
|
Loading…
Reference in New Issue
Block a user