mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
Merge remote-tracking branch 'origin/main' into fix-numpy-pinning
This commit is contained in:
commit
e9c6462629
@ -2,6 +2,7 @@ import importlib
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
@ -9,7 +10,7 @@ from pathlib import Path
|
|||||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_source_to_path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
@ -256,9 +257,10 @@ def convert(
|
|||||||
if from_formats is None:
|
if from_formats is None:
|
||||||
from_formats = [e for e in InputFormat]
|
from_formats = [e for e in InputFormat]
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tempdir:
|
||||||
input_doc_paths: List[Path] = []
|
input_doc_paths: List[Path] = []
|
||||||
for src in input_sources:
|
for src in input_sources:
|
||||||
source = resolve_file_source(source=src)
|
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
||||||
if not source.exists():
|
if not source.exists():
|
||||||
err_console.print(
|
err_console.print(
|
||||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||||
@ -302,7 +304,9 @@ def convert(
|
|||||||
ocr_options=ocr_options,
|
ocr_options=ocr_options,
|
||||||
do_table_structure=True,
|
do_table_structure=True,
|
||||||
)
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = (
|
||||||
|
True # do_cell_matching
|
||||||
|
)
|
||||||
pipeline_options.table_structure_options.mode = table_mode
|
pipeline_options.table_structure_options.mode = table_mode
|
||||||
|
|
||||||
if artifacts_path is not None:
|
if artifacts_path is not None:
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
from enum import Enum, auto
|
from enum import Enum, auto
|
||||||
from io import BytesIO
|
|
||||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
|||||||
Size,
|
Size,
|
||||||
TableCell,
|
TableCell,
|
||||||
)
|
)
|
||||||
|
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
|
||||||
|
DocumentStream,
|
||||||
|
)
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
@ -207,10 +209,3 @@ class Page(BaseModel):
|
|||||||
@property
|
@property
|
||||||
def image(self) -> Optional[Image]:
|
def image(self) -> Optional[Image]:
|
||||||
return self.get_image(scale=self._default_image_scale)
|
return self.get_image(scale=self._default_image_scale)
|
||||||
|
|
||||||
|
|
||||||
class DocumentStream(BaseModel):
|
|
||||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
||||||
|
|
||||||
name: str
|
|
||||||
stream: BytesIO
|
|
||||||
|
@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
|||||||
)
|
)
|
||||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_source_to_stream
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||||
) -> Iterable[InputDocument]:
|
) -> Iterable[InputDocument]:
|
||||||
for item in self.path_or_stream_iterator:
|
for item in self.path_or_stream_iterator:
|
||||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
||||||
format = self._guess_format(obj)
|
format = self._guess_format(obj)
|
||||||
if format not in format_options.keys():
|
if format not in format_options.keys():
|
||||||
_log.info(
|
_log.info(
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from subprocess import DEVNULL, PIPE, Popen
|
from subprocess import DEVNULL, PIPE, Popen
|
||||||
from typing import Iterable, Optional, Tuple
|
from typing import Iterable, Optional, Tuple
|
||||||
@ -130,14 +131,17 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
high_res_image = page._backend.get_page_image(
|
high_res_image = page._backend.get_page_image(
|
||||||
scale=self.scale, cropbox=ocr_rect
|
scale=self.scale, cropbox=ocr_rect
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
with tempfile.NamedTemporaryFile(
|
with tempfile.NamedTemporaryFile(
|
||||||
suffix=".png", mode="w"
|
suffix=".png", mode="w+b", delete=False
|
||||||
) as image_file:
|
) as image_file:
|
||||||
fname = image_file.name
|
fname = image_file.name
|
||||||
high_res_image.save(fname)
|
high_res_image.save(image_file)
|
||||||
|
|
||||||
df = self._run_tesseract(fname)
|
df = self._run_tesseract(fname)
|
||||||
|
finally:
|
||||||
|
if os.path.exists(fname):
|
||||||
|
os.remove(fname)
|
||||||
|
|
||||||
# _log.info(df)
|
# _log.info(df)
|
||||||
|
|
||||||
|
2
poetry.lock
generated
2
poetry.lock
generated
@ -7647,4 +7647,4 @@ tesserocr = ["tesserocr"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.9"
|
python-versions = "^3.9"
|
||||||
content-hash = "cbcb4f196d6d2631ce82af90af2d356c557c9dcd2c12bb7ee193043962ba729f"
|
content-hash = "33ee730cf750e618ec005ad44ad09617bc8f95632b30ac02b5290a03a33bdf5b"
|
||||||
|
@ -26,7 +26,7 @@ packages = [{include = "docling"}]
|
|||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
pydantic = ">=2.0.0,<2.10"
|
pydantic = ">=2.0.0,<2.10"
|
||||||
docling-core = "^2.5.1"
|
docling-core = "^2.6.1"
|
||||||
docling-ibm-models = "^2.0.6"
|
docling-ibm-models = "^2.0.6"
|
||||||
deepsearch-glm = "^0.26.1"
|
deepsearch-glm = "^0.26.1"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
|
Loading…
Reference in New Issue
Block a user