Merge remote-tracking branch 'origin/main' into fix-numpy-pinning

Michele Dolfi 2024-12-03 10:51:10 +01:00
commit e9c6462629
6 changed files with 103 additions and 100 deletions

docling/cli/main.py

@@ -2,6 +2,7 @@ import importlib
 import json
 import logging
 import re
+import tempfile
 import time
 import warnings
 from enum import Enum
@@ -9,7 +10,7 @@ from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_path
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -256,95 +257,98 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]
-    input_doc_paths: List[Path] = []
-    for src in input_sources:
-        source = resolve_file_source(source=src)
-        if not source.exists():
-            err_console.print(
-                f"[red]Error: The input file {source} does not exist.[/red]"
-            )
-            raise typer.Abort()
-        elif source.is_dir():
-            for fmt in from_formats:
-                for ext in FormatToExtensions[fmt]:
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
-        else:
-            input_doc_paths.append(source)
-    if to_formats is None:
-        to_formats = [OutputFormat.MARKDOWN]
-    export_json = OutputFormat.JSON in to_formats
-    export_md = OutputFormat.MARKDOWN in to_formats
-    export_txt = OutputFormat.TEXT in to_formats
-    export_doctags = OutputFormat.DOCTAGS in to_formats
-    if ocr_engine == OcrEngine.EASYOCR:
-        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT_CLI:
-        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT:
-        ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.OCRMAC:
-        ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.RAPIDOCR:
-        ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
-    else:
-        raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
-    ocr_lang_list = _split_list(ocr_lang)
-    if ocr_lang_list is not None:
-        ocr_options.lang = ocr_lang_list
-    pipeline_options = PdfPipelineOptions(
-        do_ocr=ocr,
-        ocr_options=ocr_options,
-        do_table_structure=True,
-    )
-    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
-    pipeline_options.table_structure_options.mode = table_mode
-    if artifacts_path is not None:
-        pipeline_options.artifacts_path = artifacts_path
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend,  # pdf_backend
-        )
-    }
-    doc_converter = DocumentConverter(
-        allowed_formats=from_formats,
-        format_options=format_options,
-    )
-    start_time = time.time()
-    conv_results = doc_converter.convert_all(
-        input_doc_paths, raises_on_error=abort_on_error
-    )
-    output.mkdir(parents=True, exist_ok=True)
-    export_documents(
-        conv_results,
-        output_dir=output,
-        export_json=export_json,
-        export_md=export_md,
-        export_txt=export_txt,
-        export_doctags=export_doctags,
-    )
-    end_time = time.time() - start_time
+    with tempfile.TemporaryDirectory() as tempdir:
+        input_doc_paths: List[Path] = []
+        for src in input_sources:
+            source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+            if not source.exists():
+                err_console.print(
+                    f"[red]Error: The input file {source} does not exist.[/red]"
+                )
+                raise typer.Abort()
+            elif source.is_dir():
+                for fmt in from_formats:
+                    for ext in FormatToExtensions[fmt]:
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
+            else:
+                input_doc_paths.append(source)
+        if to_formats is None:
+            to_formats = [OutputFormat.MARKDOWN]
+        export_json = OutputFormat.JSON in to_formats
+        export_md = OutputFormat.MARKDOWN in to_formats
+        export_txt = OutputFormat.TEXT in to_formats
+        export_doctags = OutputFormat.DOCTAGS in to_formats
+        if ocr_engine == OcrEngine.EASYOCR:
+            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.RAPIDOCR:
+            ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
+        else:
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+        ocr_lang_list = _split_list(ocr_lang)
+        if ocr_lang_list is not None:
+            ocr_options.lang = ocr_lang_list
+        pipeline_options = PdfPipelineOptions(
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
+        )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode
+        if artifacts_path is not None:
+            pipeline_options.artifacts_path = artifacts_path
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+        format_options: Dict[InputFormat, FormatOption] = {
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=backend,  # pdf_backend
+            )
+        }
+        doc_converter = DocumentConverter(
+            allowed_formats=from_formats,
+            format_options=format_options,
+        )
+        start_time = time.time()
+        conv_results = doc_converter.convert_all(
+            input_doc_paths, raises_on_error=abort_on_error
+        )
+        output.mkdir(parents=True, exist_ok=True)
+        export_documents(
+            conv_results,
+            output_dir=output,
+            export_json=export_json,
+            export_md=export_md,
+            export_txt=export_txt,
+            export_doctags=export_doctags,
+        )
+        end_time = time.time() - start_time
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")

docling/datamodel/base_models.py

@@ -1,5 +1,4 @@
 from enum import Enum, auto
-from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
     Size,
     TableCell,
 )
+from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from this location
+    DocumentStream,
+)
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
@@ -207,10 +209,3 @@ class Page(BaseModel):
     @property
     def image(self) -> Optional[Image]:
         return self.get_image(scale=self._default_image_scale)
-class DocumentStream(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    name: str
-    stream: BytesIO

docling/datamodel/document.py

@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_stream
 from pydantic import BaseModel
 from typing_extensions import deprecated
@@ -459,7 +459,7 @@
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_file_source(item) if isinstance(item, str) else item
+            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
             if format not in format_options.keys():
                 _log.info(

docling/models/tesseract_ocr_cli_model.py

@@ -1,5 +1,6 @@
 import io
 import logging
+import os
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
 from typing import Iterable, Optional, Tuple
@@ -130,14 +131,17 @@ class TesseractOcrCliModel(BaseOcrModel):
                 high_res_image = page._backend.get_page_image(
                     scale=self.scale, cropbox=ocr_rect
                 )
-                with tempfile.NamedTemporaryFile(
-                    suffix=".png", mode="w"
-                ) as image_file:
-                    fname = image_file.name
-                    high_res_image.save(fname)
+                try:
+                    with tempfile.NamedTemporaryFile(
+                        suffix=".png", mode="w+b", delete=False
+                    ) as image_file:
+                        fname = image_file.name
+                        high_res_image.save(image_file)
                     df = self._run_tesseract(fname)
+                finally:
+                    if os.path.exists(fname):
+                        os.remove(fname)
                 # _log.info(df)

poetry.lock (generated)

@ -7647,4 +7647,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "cbcb4f196d6d2631ce82af90af2d356c557c9dcd2c12bb7ee193043962ba729f"
content-hash = "33ee730cf750e618ec005ad44ad09617bc8f95632b30ac02b5290a03a33bdf5b"

pyproject.toml

@ -26,7 +26,7 @@ packages = [{include = "docling"}]
######################
python = "^3.9"
pydantic = ">=2.0.0,<2.10"
docling-core = "^2.5.1"
docling-core = "^2.6.1"
docling-ibm-models = "^2.0.6"
deepsearch-glm = "^0.26.1"
filetype = "^1.2.0"