mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
Merge remote-tracking branch 'origin/main' into fix-numpy-pinning
This commit is contained in:
commit
e9c6462629
@ -2,6 +2,7 @@ import importlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
from enum import Enum
|
||||
@ -9,7 +10,7 @@ from pathlib import Path
|
||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from docling_core.utils.file import resolve_source_to_path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
@ -256,9 +257,10 @@ def convert(
|
||||
if from_formats is None:
|
||||
from_formats = [e for e in InputFormat]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tempdir:
|
||||
input_doc_paths: List[Path] = []
|
||||
for src in input_sources:
|
||||
source = resolve_file_source(source=src)
|
||||
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
||||
if not source.exists():
|
||||
err_console.print(
|
||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||
@ -302,7 +304,9 @@ def convert(
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
pipeline_options.table_structure_options.do_cell_matching = (
|
||||
True # do_cell_matching
|
||||
)
|
||||
pipeline_options.table_structure_options.mode = table_mode
|
||||
|
||||
if artifacts_path is not None:
|
||||
|
@ -1,5 +1,4 @@
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
||||
Size,
|
||||
TableCell,
|
||||
)
|
||||
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
|
||||
DocumentStream,
|
||||
)
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
@ -207,10 +209,3 @@ class Page(BaseModel):
|
||||
@property
|
||||
def image(self) -> Optional[Image]:
|
||||
return self.get_image(scale=self._default_image_scale)
|
||||
|
||||
|
||||
class DocumentStream(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
name: str
|
||||
stream: BytesIO
|
||||
|
@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
||||
)
|
||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from docling_core.utils.file import resolve_source_to_stream
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
|
||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||
) -> Iterable[InputDocument]:
|
||||
for item in self.path_or_stream_iterator:
|
||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
||||
format = self._guess_format(obj)
|
||||
if format not in format_options.keys():
|
||||
_log.info(
|
||||
|
@ -1,5 +1,6 @@
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from subprocess import DEVNULL, PIPE, Popen
|
||||
from typing import Iterable, Optional, Tuple
|
||||
@ -130,14 +131,17 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".png", mode="w"
|
||||
suffix=".png", mode="w+b", delete=False
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(fname)
|
||||
high_res_image.save(image_file)
|
||||
|
||||
df = self._run_tesseract(fname)
|
||||
finally:
|
||||
if os.path.exists(fname):
|
||||
os.remove(fname)
|
||||
|
||||
# _log.info(df)
|
||||
|
||||
|
2
poetry.lock
generated
2
poetry.lock
generated
@ -7647,4 +7647,4 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "cbcb4f196d6d2631ce82af90af2d356c557c9dcd2c12bb7ee193043962ba729f"
|
||||
content-hash = "33ee730cf750e618ec005ad44ad09617bc8f95632b30ac02b5290a03a33bdf5b"
|
||||
|
@ -26,7 +26,7 @@ packages = [{include = "docling"}]
|
||||
######################
|
||||
python = "^3.9"
|
||||
pydantic = ">=2.0.0,<2.10"
|
||||
docling-core = "^2.5.1"
|
||||
docling-core = "^2.6.1"
|
||||
docling-ibm-models = "^2.0.6"
|
||||
deepsearch-glm = "^0.26.1"
|
||||
filetype = "^1.2.0"
|
||||
|
Loading…
Reference in New Issue
Block a user