mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
Merge branch 'DS4SD:main' into simonas/base-options
This commit is contained in:
commit
1c14a2ac56
18
CHANGELOG.md
18
CHANGELOG.md
@ -1,3 +1,21 @@
|
||||
## [v2.8.2](https://github.com/DS4SD/docling/releases/tag/v2.8.2) - 2024-12-03
|
||||
|
||||
### Fix
|
||||
|
||||
* ParserError EOF inside string (#470) ([#472](https://github.com/DS4SD/docling/issues/472)) ([`c90c41c`](https://github.com/DS4SD/docling/commit/c90c41c391de4366db554d7a71ce9a35467c981e))
|
||||
* PermissionError when using tesseract_ocr_cli_model ([#496](https://github.com/DS4SD/docling/issues/496)) ([`d3f84b2`](https://github.com/DS4SD/docling/commit/d3f84b2457125feacd0c21d6513e7ae69a308ea5))
|
||||
|
||||
### Documentation
|
||||
|
||||
* Add styling for faq ([#502](https://github.com/DS4SD/docling/issues/502)) ([`5ba3807`](https://github.com/DS4SD/docling/commit/5ba3807f315a01b1a4e8df9bab40e34a4238205a))
|
||||
* Typo in faq ([#484](https://github.com/DS4SD/docling/issues/484)) ([`33cff98`](https://github.com/DS4SD/docling/commit/33cff98d360c02a382a66850c696a0cf511659ac))
|
||||
* Add automatic api reference ([#475](https://github.com/DS4SD/docling/issues/475)) ([`d487210`](https://github.com/DS4SD/docling/commit/d4872103b8f24e38b37a8cd3ac414d3e02e7d6e8))
|
||||
* Introduce faq section ([#468](https://github.com/DS4SD/docling/issues/468)) ([`8ccb3c6`](https://github.com/DS4SD/docling/commit/8ccb3c6db69318789af7deec26cfa2a3fd71302e))
|
||||
|
||||
### Performance
|
||||
|
||||
* Prevent temp file leftovers, reuse core type ([#487](https://github.com/DS4SD/docling/issues/487)) ([`051789d`](https://github.com/DS4SD/docling/commit/051789d01706d3823dd6307eca4dc5faacd1b7ce))
|
||||
|
||||
## [v2.8.1](https://github.com/DS4SD/docling/releases/tag/v2.8.1) - 2024-11-29
|
||||
|
||||
### Fix
|
||||
|
@ -2,6 +2,7 @@ import importlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
from enum import Enum
|
||||
@ -9,7 +10,7 @@ from pathlib import Path
|
||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from docling_core.utils.file import resolve_source_to_path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
@ -256,95 +257,98 @@ def convert(
|
||||
if from_formats is None:
|
||||
from_formats = [e for e in InputFormat]
|
||||
|
||||
input_doc_paths: List[Path] = []
|
||||
for src in input_sources:
|
||||
source = resolve_file_source(source=src)
|
||||
if not source.exists():
|
||||
err_console.print(
|
||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||
)
|
||||
raise typer.Abort()
|
||||
elif source.is_dir():
|
||||
for fmt in from_formats:
|
||||
for ext in FormatToExtensions[fmt]:
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
||||
with tempfile.TemporaryDirectory() as tempdir:
|
||||
input_doc_paths: List[Path] = []
|
||||
for src in input_sources:
|
||||
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
||||
if not source.exists():
|
||||
err_console.print(
|
||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||
)
|
||||
raise typer.Abort()
|
||||
elif source.is_dir():
|
||||
for fmt in from_formats:
|
||||
for ext in FormatToExtensions[fmt]:
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
|
||||
if to_formats is None:
|
||||
to_formats = [OutputFormat.MARKDOWN]
|
||||
|
||||
export_json = OutputFormat.JSON in to_formats
|
||||
export_md = OutputFormat.MARKDOWN in to_formats
|
||||
export_txt = OutputFormat.TEXT in to_formats
|
||||
export_doctags = OutputFormat.DOCTAGS in to_formats
|
||||
|
||||
if ocr_engine == OcrEngine.EASYOCR:
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.OCRMAC:
|
||||
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.RAPIDOCR:
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
if to_formats is None:
|
||||
to_formats = [OutputFormat.MARKDOWN]
|
||||
ocr_lang_list = _split_list(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
export_json = OutputFormat.JSON in to_formats
|
||||
export_md = OutputFormat.MARKDOWN in to_formats
|
||||
export_txt = OutputFormat.TEXT in to_formats
|
||||
export_doctags = OutputFormat.DOCTAGS in to_formats
|
||||
|
||||
if ocr_engine == OcrEngine.EASYOCR:
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.OCRMAC:
|
||||
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.RAPIDOCR:
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
ocr_lang_list = _split_list(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
pipeline_options.table_structure_options.mode = table_mode
|
||||
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend, # pdf_backend
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = (
|
||||
True # do_cell_matching
|
||||
)
|
||||
pipeline_options.table_structure_options.mode = table_mode
|
||||
|
||||
start_time = time.time()
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths, raises_on_error=abort_on_error
|
||||
)
|
||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||
|
||||
output.mkdir(parents=True, exist_ok=True)
|
||||
export_documents(
|
||||
conv_results,
|
||||
output_dir=output,
|
||||
export_json=export_json,
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
)
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend, # pdf_backend
|
||||
)
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths, raises_on_error=abort_on_error
|
||||
)
|
||||
|
||||
output.mkdir(parents=True, exist_ok=True)
|
||||
export_documents(
|
||||
conv_results,
|
||||
output_dir=output,
|
||||
export_json=export_json,
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
||||
Size,
|
||||
TableCell,
|
||||
)
|
||||
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
|
||||
DocumentStream,
|
||||
)
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
|
||||
FAILURE = auto()
|
||||
SUCCESS = auto()
|
||||
PARTIAL_SUCCESS = auto()
|
||||
SKIPPED = auto()
|
||||
|
||||
|
||||
class InputFormat(str, Enum):
|
||||
@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
|
||||
DOCUMENT_BACKEND = auto()
|
||||
MODEL = auto()
|
||||
DOC_ASSEMBLER = auto()
|
||||
USER_INPUT = auto()
|
||||
|
||||
|
||||
class ErrorItem(BaseModel):
|
||||
@ -207,10 +211,3 @@ class Page(BaseModel):
|
||||
@property
|
||||
def image(self) -> Optional[Image]:
|
||||
return self.get_image(scale=self._default_image_scale)
|
||||
|
||||
|
||||
class DocumentStream(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
name: str
|
||||
stream: BytesIO
|
||||
|
@ -3,7 +3,7 @@ import re
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
|
||||
|
||||
import filetype
|
||||
from docling_core.types.doc import (
|
||||
@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
||||
)
|
||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from docling_core.utils.file import resolve_source_to_stream
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
@ -164,12 +164,6 @@ class InputDocument(BaseModel):
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
if backend is None:
|
||||
raise RuntimeError(
|
||||
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
if not self._backend.is_valid():
|
||||
self.valid = False
|
||||
@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
|
||||
return ds_doc
|
||||
|
||||
|
||||
class _DummyBackend(AbstractDocumentBackend):
    """Placeholder backend that never validates and supports no formats.

    It is assigned as the backend when an input's guessed format is not
    among the configured format options.
    """

    def __init__(self, *args, **kwargs):
        # Forward all arguments unchanged to the base-class constructor.
        super().__init__(*args, **kwargs)

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return an empty set: this backend handles no input format."""
        return set()

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: pagination is not supported."""
        return False

    def is_valid(self) -> bool:
        """Return False unconditionally."""
        return False

    def unload(self):
        """Delegate to the base-class ``unload`` and return its result."""
        return super().unload()
|
||||
|
||||
|
||||
class _DocumentConversionInput(BaseModel):
|
||||
|
||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||
@ -459,13 +472,14 @@ class _DocumentConversionInput(BaseModel):
|
||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||
) -> Iterable[InputDocument]:
|
||||
for item in self.path_or_stream_iterator:
|
||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
||||
format = self._guess_format(obj)
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
if format not in format_options.keys():
|
||||
_log.info(
|
||||
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
||||
_log.error(
|
||||
f"Input document {obj.name} does not match any allowed format."
|
||||
)
|
||||
continue
|
||||
backend = _DummyBackend
|
||||
else:
|
||||
backend = format_options[format].backend
|
||||
|
||||
|
@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
InputFormat,
|
||||
)
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
@ -23,6 +29,7 @@ from docling.datamodel.document import (
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import DocumentLimits, settings
|
||||
from docling.exceptions import ConversionError
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||
|
||||
|
||||
_format_to_default_options = {
|
||||
InputFormat.XLSX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
||||
),
|
||||
InputFormat.DOCX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
||||
),
|
||||
InputFormat.PPTX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
||||
),
|
||||
InputFormat.MD: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
||||
),
|
||||
InputFormat.ASCIIDOC: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
}
|
||||
def _get_default_option(format: InputFormat) -> FormatOption:
    """Return the default ``FormatOption`` for the given input format.

    The table of defaults is rebuilt on every call, so each call constructs
    new ``FormatOption`` objects.

    Args:
        format: The input format to look up.

    Returns:
        The default option configured for ``format``.

    Raises:
        RuntimeError: If no default option is configured for ``format``.
    """
    # Formats handled by SimplePipeline, paired with their backend class.
    simple_backends = (
        (InputFormat.XLSX, MsExcelDocumentBackend),
        (InputFormat.DOCX, MsWordDocumentBackend),
        (InputFormat.PPTX, MsPowerpointDocumentBackend),
        (InputFormat.MD, MarkdownDocumentBackend),
        (InputFormat.ASCIIDOC, AsciiDocBackend),
        (InputFormat.HTML, HTMLDocumentBackend),
    )
    # Formats handled by StandardPdfPipeline with the docling-parse backend.
    pdf_like_formats = (InputFormat.IMAGE, InputFormat.PDF)

    defaults = {
        fmt: FormatOption(pipeline_cls=SimplePipeline, backend=backend_cls)
        for fmt, backend_cls in simple_backends
    }
    defaults.update(
        (
            fmt,
            FormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=DoclingParseDocumentBackend,
            ),
        )
        for fmt in pdf_like_formats
    )

    selected = defaults.get(format)
    if selected is None:
        raise RuntimeError(f"No default options configured for {format}")
    return selected
|
||||
|
||||
|
||||
class DocumentConverter:
|
||||
@ -121,36 +133,26 @@ class DocumentConverter:
|
||||
allowed_formats: Optional[List[InputFormat]] = None,
|
||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||
):
|
||||
self.allowed_formats = allowed_formats
|
||||
self.format_to_options = format_options
|
||||
|
||||
if self.allowed_formats is None:
|
||||
# if self.format_to_options is not None:
|
||||
# self.allowed_formats = self.format_to_options.keys()
|
||||
# else:
|
||||
self.allowed_formats = [e for e in InputFormat] # all formats
|
||||
|
||||
if self.format_to_options is None:
|
||||
self.format_to_options = _format_to_default_options
|
||||
else:
|
||||
for f in self.allowed_formats:
|
||||
if f not in self.format_to_options.keys():
|
||||
_log.debug(f"Requested format {f} will use default options.")
|
||||
self.format_to_options[f] = _format_to_default_options[f]
|
||||
|
||||
remove_keys = []
|
||||
for f in self.format_to_options.keys():
|
||||
if f not in self.allowed_formats:
|
||||
remove_keys.append(f)
|
||||
|
||||
for f in remove_keys:
|
||||
self.format_to_options.pop(f)
|
||||
|
||||
self.allowed_formats = (
|
||||
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
||||
)
|
||||
self.format_to_options = {
|
||||
format: (
|
||||
_get_default_option(format=format)
|
||||
if (custom_option := (format_options or {}).get(format)) is None
|
||||
else custom_option
|
||||
)
|
||||
for format in self.allowed_formats
|
||||
}
|
||||
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
||||
|
||||
def initialize_pipeline(self, format: InputFormat):
|
||||
"""Initialize the conversion pipeline for the selected format."""
|
||||
self._get_pipeline(doc_format=format)
|
||||
pipeline = self._get_pipeline(doc_format=format)
|
||||
if pipeline is None:
|
||||
raise ConversionError(
|
||||
f"No pipeline could be initialized for format {format}"
|
||||
)
|
||||
|
||||
@validate_call(config=ConfigDict(strict=True))
|
||||
def convert(
|
||||
@ -186,22 +188,28 @@ class DocumentConverter:
|
||||
limits=limits,
|
||||
)
|
||||
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
||||
|
||||
had_result = False
|
||||
for conv_res in conv_res_iter:
|
||||
had_result = True
|
||||
if raises_on_error and conv_res.status not in {
|
||||
ConversionStatus.SUCCESS,
|
||||
ConversionStatus.PARTIAL_SUCCESS,
|
||||
}:
|
||||
raise RuntimeError(
|
||||
raise ConversionError(
|
||||
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
||||
)
|
||||
else:
|
||||
yield conv_res
|
||||
|
||||
if not had_result and raises_on_error:
|
||||
raise ConversionError(
|
||||
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||
)
|
||||
|
||||
def _convert(
|
||||
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
||||
) -> Iterator[ConversionResult]:
|
||||
assert self.format_to_options is not None
|
||||
|
||||
start_time = time.monotonic()
|
||||
|
||||
for input_batch in chunkify(
|
||||
@ -223,27 +231,22 @@ class DocumentConverter:
|
||||
):
|
||||
elapsed = time.monotonic() - start_time
|
||||
start_time = time.monotonic()
|
||||
|
||||
if item is not None:
|
||||
_log.info(
|
||||
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
||||
)
|
||||
yield item
|
||||
else:
|
||||
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
||||
_log.info(
|
||||
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
||||
)
|
||||
yield item
|
||||
|
||||
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
||||
assert self.format_to_options is not None
|
||||
|
||||
fopt = self.format_to_options.get(doc_format)
|
||||
|
||||
if fopt is None:
|
||||
raise RuntimeError(f"Could not get pipeline for {doc_format}")
|
||||
return None
|
||||
else:
|
||||
pipeline_class = fopt.pipeline_cls
|
||||
pipeline_options = fopt.pipeline_options
|
||||
|
||||
assert pipeline_options is not None
|
||||
if pipeline_options is None:
|
||||
return None
|
||||
# TODO this will ignore if different options have been defined for the same pipeline class.
|
||||
if (
|
||||
pipeline_class not in self.initialized_pipelines
|
||||
@ -257,11 +260,26 @@ class DocumentConverter:
|
||||
|
||||
def _process_document(
|
||||
self, in_doc: InputDocument, raises_on_error: bool
|
||||
) -> Optional[ConversionResult]:
|
||||
assert self.allowed_formats is not None
|
||||
assert in_doc.format in self.allowed_formats
|
||||
) -> ConversionResult:
|
||||
|
||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||
valid = (
|
||||
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
||||
)
|
||||
if valid:
|
||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||
else:
|
||||
error_message = f"File format not allowed: {in_doc.file}"
|
||||
if raises_on_error:
|
||||
raise ConversionError(error_message)
|
||||
else:
|
||||
error_item = ErrorItem(
|
||||
component_type=DoclingComponentType.USER_INPUT,
|
||||
module_name="",
|
||||
error_message=error_message,
|
||||
)
|
||||
conv_res = ConversionResult(
|
||||
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
@ -270,26 +288,28 @@ class DocumentConverter:
|
||||
) -> ConversionResult:
|
||||
if in_doc.valid:
|
||||
pipeline = self._get_pipeline(in_doc.format)
|
||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
||||
if pipeline is not None:
|
||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||
else:
|
||||
if raises_on_error:
|
||||
raise RuntimeError(
|
||||
raise ConversionError(
|
||||
f"No pipeline could be initialized for {in_doc.file}."
|
||||
)
|
||||
else:
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||
|
||||
conv_res = ConversionResult(
|
||||
input=in_doc,
|
||||
status=ConversionStatus.FAILURE,
|
||||
)
|
||||
else:
|
||||
if raises_on_error:
|
||||
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
||||
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
||||
|
||||
else:
|
||||
# invalid doc or not of desired format
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
conv_res = ConversionResult(
|
||||
input=in_doc,
|
||||
status=ConversionStatus.FAILURE,
|
||||
)
|
||||
# TODO add error log why it failed.
|
||||
|
||||
return conv_res
|
||||
|
6
docling/exceptions.py
Normal file
6
docling/exceptions.py
Normal file
@ -0,0 +1,6 @@
|
||||
class BaseError(RuntimeError):
    """Root of this module's exception hierarchy; a ``RuntimeError`` subclass."""


class ConversionError(BaseError):
    """Error type for conversion failures; a ``BaseError`` subclass."""
|
@ -1,5 +1,7 @@
|
||||
import csv
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from subprocess import DEVNULL, PIPE, Popen
|
||||
from typing import Iterable, Optional, Tuple
|
||||
@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
# _log.info(decoded_data)
|
||||
|
||||
# Read the TSV file generated by Tesseract
|
||||
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
||||
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
|
||||
|
||||
# Display the dataframe (optional)
|
||||
# _log.info("df: ", df.head())
|
||||
@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".png", mode="w"
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(fname)
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".png", mode="w+b", delete=False
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(image_file)
|
||||
|
||||
df = self._run_tesseract(fname)
|
||||
finally:
|
||||
if os.path.exists(fname):
|
||||
os.remove(fname)
|
||||
|
||||
# _log.info(df)
|
||||
|
||||
|
209
docs/faq.md
209
docs/faq.md
@ -3,132 +3,145 @@
|
||||
This is a collection of FAQs gathered from user questions on <https://github.com/DS4SD/docling/discussions>.
|
||||
|
||||
|
||||
### Python 3.13 support
|
||||
??? question "Is Python 3.13 supported?"
|
||||
|
||||
Full support for Python 3.13 is currently waiting for [pytorch](https://github.com/pytorch/pytorch).
|
||||
### Is Python 3.13 supported?
|
||||
|
||||
At the moment, no release has full support, but nightly builds are available. Docling was tested on Python 3.13 with the following steps:
|
||||
Full support for Python 3.13 is currently waiting for [pytorch](https://github.com/pytorch/pytorch).
|
||||
|
||||
```sh
|
||||
# Create a python 3.13 virtualenv
|
||||
python3.13 -m venv venv
|
||||
source ./venv/bin/activate
|
||||
At the moment, no release has full support, but nightly builds are available. Docling was tested on Python 3.13 with the following steps:
|
||||
|
||||
# Install torch nightly builds, see https://pytorch.org/
|
||||
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
```sh
|
||||
# Create a python 3.13 virtualenv
|
||||
python3.13 -m venv venv
|
||||
source ./venv/bin/activate
|
||||
|
||||
# Install docling
|
||||
pip3 install docling
|
||||
# Install torch nightly builds, see https://pytorch.org/
|
||||
pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
|
||||
# Run docling
|
||||
docling --no-ocr https://arxiv.org/pdf/2408.09869
|
||||
```
|
||||
# Install docling
|
||||
pip3 install docling
|
||||
|
||||
_Note: we are disabling OCR since easyocr and the nightly torch builds have some conflicts._
|
||||
# Run docling
|
||||
docling --no-ocr https://arxiv.org/pdf/2408.09869
|
||||
```
|
||||
|
||||
Source: Issue [#136](https://github.com/DS4SD/docling/issues/136)
|
||||
_Note: we are disabling OCR since easyocr and the nightly torch builds have some conflicts._
|
||||
|
||||
Source: Issue [#136](https://github.com/DS4SD/docling/issues/136)
|
||||
|
||||
|
||||
### Install conflicts with numpy (python 3.13)
|
||||
??? question "Install conflicts with numpy (python 3.13)"
|
||||
|
||||
### Install conflicts with numpy (python 3.13)
|
||||
|
||||
When using `docling-ibm-models>=2.0.7` and `deepsearch-glm>=0.26.2` these issues should not show up anymore.
|
||||
Docling supports numpy versions `>=1.24.4,<3.0.0` which should match all usages.
|
||||
|
||||
**For older versions**
|
||||
|
||||
This has been observed installing docling and langchain via poetry.
|
||||
|
||||
```
|
||||
...
|
||||
Thus, docling (>=2.7.0,<3.0.0) requires numpy (>=1.26.4,<2.0.0).
|
||||
So, because ... depends on both numpy (>=2.0.2,<3.0.0) and docling (^2.7.0), version solving failed.
|
||||
```
|
||||
|
||||
Numpy only adds Python 3.13 support starting with some 2.x.y version. To prepare for 3.13, Docling depends on a 2.x.y version for Python 3.13 and on a 1.x.y version otherwise. If you are allowing 3.13 in your pyproject.toml, Poetry will try to find some way to reconcile Docling's numpy version for 3.13 (some 2.x.y) with LangChain's version for that (some 1.x.y) — leading to the error above.
|
||||
|
||||
Check if Python 3.13 is among the Python versions allowed by your pyproject.toml and if so, remove it and try again.
|
||||
E.g., if you have python = "^3.10", use python = ">=3.10,<3.13" instead.
|
||||
|
||||
If you want to retain compatibility with python 3.9-3.13, you can also use a selector in pyproject.toml similar to the following
|
||||
|
||||
```toml
|
||||
numpy = [
|
||||
{ version = "^2.1.0", markers = 'python_version >= "3.13"' },
|
||||
{ version = "^1.24.4", markers = 'python_version < "3.13"' },
|
||||
]
|
||||
```
|
||||
|
||||
Source: Issue [#283](https://github.com/DS4SD/docling/issues/283#issuecomment-2465035868)
|
||||
|
||||
|
||||
This has been observed installing docling and langchain via poetry.
|
||||
??? question "Are text styles (bold, underline, etc) supported?"
|
||||
|
||||
```
|
||||
...
|
||||
Thus, docling (>=2.7.0,<3.0.0) requires numpy (>=1.26.4,<2.0.0).
|
||||
So, because ... depends on both numpy (>=2.0.2,<3.0.0) and docling (^2.7.0), version solving failed.
|
||||
```
|
||||
### Are text styles (bold, underline, etc) supported?
|
||||
|
||||
Numpy only adds Python 3.13 support starting with some 2.x.y version. To prepare for 3.13, Docling depends on a 2.x.y version for Python 3.13 and on a 1.x.y version otherwise. If you are allowing 3.13 in your pyproject.toml, Poetry will try to find some way to reconcile Docling's numpy version for 3.13 (some 2.x.y) with LangChain's version for that (some 1.x.y) — leading to the error above.
|
||||
Currently text styles are not supported in the `DoclingDocument` format.
|
||||
If you are interested in contributing this feature, please open a discussion topic to brainstorm on the design.
|
||||
|
||||
Check if Python 3.13 is among the Python versions allowed by your pyproject.toml and if so, remove it and try again.
|
||||
E.g., if you have python = "^3.10", use python = ">=3.10,<3.13" instead.
|
||||
|
||||
If you want to retain compatibility with python 3.9-3.13, you can also use a selector in pyproject.toml similar to the following
|
||||
|
||||
```toml
|
||||
numpy = [
|
||||
{ version = "^2.1.0", markers = 'python_version >= "3.13"' },
|
||||
{ version = "^1.24.4", markers = 'python_version < "3.13"' },
|
||||
]
|
||||
```
|
||||
_Note: this is not a simple topic_
|
||||
|
||||
|
||||
Source: Issue [#283](https://github.com/DS4SD/docling/issues/283#issuecomment-2465035868)
|
||||
??? question "How do I run completely offline?"
|
||||
|
||||
### How do I run completely offline?
|
||||
|
||||
Docling is not using any remote service, hence it can run in completely isolated air-gapped environments.
|
||||
|
||||
The only requirement is pointing the Docling runtime to the location where the model artifacts have been stored.
|
||||
|
||||
For example
|
||||
|
||||
```py
|
||||
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path="your location")
|
||||
converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
Source: Issue [#326](https://github.com/DS4SD/docling/issues/326)
|
||||
|
||||
|
||||
### GPU support
|
||||
??? question "Which model weights are needed to run Docling?"
|
||||
### Which model weights are needed to run Docling?
|
||||
|
||||
TBA
|
||||
Model weights are needed for the AI models used in the PDF pipeline. Other document types (docx, pptx, etc) do not have any such requirement.
|
||||
|
||||
For processing PDF documents, Docling requires the model weights from <https://huggingface.co/ds4sd/docling-models>.
|
||||
|
||||
When OCR is enabled, some engines also require model artifacts. For example EasyOCR, for which Docling has [special pipeline options](https://github.com/DS4SD/docling/blob/main/docling/datamodel/pipeline_options.py#L68) to control the runtime behavior.
|
||||
|
||||
|
||||
### Text styles (bold, underline, etc)
|
||||
??? question "SSL error downloading model weights"
|
||||
|
||||
TBA
|
||||
### SSL error downloading model weights
|
||||
|
||||
```
|
||||
URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>
|
||||
```
|
||||
|
||||
Similar SSL download errors have been observed by some users. This happens when model weights are fetched from Hugging Face.
|
||||
The error could happen when the python environment doesn't have an up-to-date list of trusted certificates.
|
||||
|
||||
Possible solutions were
|
||||
|
||||
- Update to the latest version of [certifi](https://pypi.org/project/certifi/), i.e. `pip install --upgrade certifi`
|
||||
- Use [pip-system-certs](https://pypi.org/project/pip-system-certs/) to use the latest trusted certificates on your system.
|
||||
|
||||
|
||||
### How do I run completely offline?
|
||||
??? question "Which OCR languages are supported?"
|
||||
|
||||
Docling is not using any remote service, hence it can run in completely isolated air-gapped environments.
|
||||
### Which OCR languages are supported?
|
||||
|
||||
The only requirement is pointing the Docling runtime to the location where the model artifacts have been stored.
|
||||
Docling supports multiple OCR engines, each of which has its own list of supported languages.
|
||||
Here is a collection of links to the original OCR engine's documentation listing the OCR languages.
|
||||
|
||||
For example
|
||||
- [EasyOCR](https://www.jaided.ai/easyocr/)
|
||||
- [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)
|
||||
- [RapidOCR](https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/)
|
||||
- [Mac OCR](https://github.com/straussmaximilian/ocrmac/tree/main?tab=readme-ov-file#example-select-language-preference)
|
||||
|
||||
```py
|
||||
Setting the OCR language in Docling is done via the OCR pipeline options:
|
||||
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path="your location")
|
||||
converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
}
|
||||
)
|
||||
```
|
||||
```py
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
|
||||
Source: Issue [#326](https://github.com/DS4SD/docling/issues/326)
|
||||
|
||||
|
||||
### Which model weights are needed to run Docling?
|
||||
|
||||
Model weights are needed for the AI models used in the PDF pipeline. Other document types (docx, pptx, etc) do not have any such requirement.
|
||||
|
||||
For processing PDF documents, Docling requires the model weights from <https://huggingface.co/ds4sd/docling-models>.
|
||||
|
||||
When OCR is enabled, some engines also require model artifacts. One example is EasyOCR, for which Docling has [special pipeline options](https://github.com/DS4SD/docling/blob/main/docling/datamodel/pipeline_options.py#L68) to control the runtime behavior.
|
||||
|
||||
|
||||
|
||||
### SSL error downloading model weights
|
||||
|
||||
```
|
||||
URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>
|
||||
```
|
||||
|
||||
Similar SSL download errors have been observed by some users. This happens when model weights are fetched from Hugging Face.
|
||||
This error can occur when the Python environment doesn't have an up-to-date list of trusted certificates.
|
||||
|
||||
Possible solutions are:
|
||||
|
||||
- Update to the latest version of [certifi](https://pypi.org/project/certifi/), i.e. `pip install --upgrade certifi`
|
||||
- Use [pip-system-certs](https://pypi.org/project/pip-system-certs/) to use the latest trusted certificates on your system.
|
||||
|
||||
|
||||
### Which OCR languages are supported?
|
||||
|
||||
Docling supports multiple OCR engines, and each one has its own list of supported languages.
|
||||
Here is a collection of links to the original OCR engine's documentation listing the OCR languages.
|
||||
|
||||
- [EasyOCR](https://www.jaided.ai/easyocr/)
|
||||
- [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)
|
||||
- [RapidOCR](https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/)
|
||||
- [Mac OCR](https://github.com/straussmaximilian/ocrmac/tree/main?tab=readme-ov-file#example-select-language-preference)
|
||||
|
||||
Setting the OCR language in Docling is done via the OCR pipeline options:
|
||||
|
||||
```py
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR
|
||||
```
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR
|
||||
```
|
||||
|
1025
poetry.lock
generated
1025
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "docling"
|
||||
version = "2.8.1" # DO NOT EDIT, updated automatically
|
||||
version = "2.8.2" # DO NOT EDIT, updated automatically
|
||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
||||
license = "MIT"
|
||||
@ -26,7 +26,7 @@ packages = [{include = "docling"}]
|
||||
######################
|
||||
python = "^3.9"
|
||||
pydantic = ">=2.0.0,<2.10"
|
||||
docling-core = "^2.5.1"
|
||||
docling-core = "^2.6.1"
|
||||
docling-ibm-models = "^2.0.6"
|
||||
deepsearch-glm = "^0.26.1"
|
||||
filetype = "^1.2.0"
|
||||
@ -90,10 +90,13 @@ langchain-huggingface = "^0.0.3"
|
||||
langchain-milvus = "^0.1.4"
|
||||
langchain-text-splitters = "^0.2.4"
|
||||
|
||||
[tool.poetry.group.constraints]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.constraints.dependencies]
|
||||
numpy = [
|
||||
{ version = "^2.1.0", markers = 'python_version >= "3.13"' },
|
||||
{ version = "^1.24.4", markers = 'python_version < "3.13"' },
|
||||
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
|
||||
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
|
||||
]
|
||||
|
||||
[tool.poetry.group.mac_intel]
|
||||
|
@ -10,7 +10,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
||||
GENERATE = True
|
||||
GENERATE = False
|
||||
|
||||
|
||||
def get_pdf_path():
|
||||
|
45
tests/test_invalid_input.py
Normal file
45
tests/test_invalid_input.py
Normal file
@ -0,0 +1,45 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream
|
||||
from docling.document_converter import ConversionError, DocumentConverter
|
||||
|
||||
|
||||
def get_pdf_path():
    """Return the relative path of the sample PDF used by the tests in this file."""
    return Path("./tests/data/2305.03393v1-pg9.pdf")
|
||||
|
||||
|
||||
@pytest.fixture
def converter():
    """Provide a ``DocumentConverter`` constructed with no arguments."""
    return DocumentConverter()
|
||||
|
||||
|
||||
def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
    """Expect a SKIPPED status for an ``.xyz`` stream when ``raises_on_error`` is False."""
    unsupported_stream = DocumentStream(name="input.xyz", stream=BytesIO(b"xyz"))
    outcome = converter.convert(unsupported_stream, raises_on_error=False)
    assert outcome.status == ConversionStatus.SKIPPED
|
||||
|
||||
|
||||
def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
    """Expect ``ConversionError`` for an ``.xyz`` stream when ``raises_on_error`` is True."""
    with pytest.raises(ConversionError):
        # Built inside the context manager so the guarded region matches
        # the single-expression form of this call.
        unsupported_stream = DocumentStream(name="input.xyz", stream=BytesIO(b"xyz"))
        converter.convert(unsupported_stream, raises_on_error=True)
|
||||
|
||||
|
||||
def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
    """Expect a FAILURE status when ``max_file_size=1`` and ``raises_on_error`` is False."""
    sample_pdf = get_pdf_path()
    outcome = converter.convert(
        sample_pdf,
        max_file_size=1,
        raises_on_error=False,
    )
    assert outcome.status == ConversionStatus.FAILURE
|
||||
|
||||
|
||||
def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
    """Expect ``ConversionError`` when ``max_file_size=1`` and ``raises_on_error`` is True."""
    with pytest.raises(ConversionError):
        # Path lookup stays inside the guarded region, as in the one-line call form.
        sample_pdf = get_pdf_path()
        converter.convert(sample_pdf, max_file_size=1, raises_on_error=True)
|
Loading…
Reference in New Issue
Block a user