mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
Merge branch 'DS4SD:main' into simonas/base-options
This commit is contained in:
commit
1c14a2ac56
18
CHANGELOG.md
18
CHANGELOG.md
@ -1,3 +1,21 @@
|
|||||||
|
## [v2.8.2](https://github.com/DS4SD/docling/releases/tag/v2.8.2) - 2024-12-03
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* ParserError EOF inside string (#470) ([#472](https://github.com/DS4SD/docling/issues/472)) ([`c90c41c`](https://github.com/DS4SD/docling/commit/c90c41c391de4366db554d7a71ce9a35467c981e))
|
||||||
|
* PermissionError when using tesseract_ocr_cli_model ([#496](https://github.com/DS4SD/docling/issues/496)) ([`d3f84b2`](https://github.com/DS4SD/docling/commit/d3f84b2457125feacd0c21d6513e7ae69a308ea5))
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
* Add styling for faq ([#502](https://github.com/DS4SD/docling/issues/502)) ([`5ba3807`](https://github.com/DS4SD/docling/commit/5ba3807f315a01b1a4e8df9bab40e34a4238205a))
|
||||||
|
* Typo in faq ([#484](https://github.com/DS4SD/docling/issues/484)) ([`33cff98`](https://github.com/DS4SD/docling/commit/33cff98d360c02a382a66850c696a0cf511659ac))
|
||||||
|
* Add automatic api reference ([#475](https://github.com/DS4SD/docling/issues/475)) ([`d487210`](https://github.com/DS4SD/docling/commit/d4872103b8f24e38b37a8cd3ac414d3e02e7d6e8))
|
||||||
|
* Introduce faq section ([#468](https://github.com/DS4SD/docling/issues/468)) ([`8ccb3c6`](https://github.com/DS4SD/docling/commit/8ccb3c6db69318789af7deec26cfa2a3fd71302e))
|
||||||
|
|
||||||
|
### Performance
|
||||||
|
|
||||||
|
* Prevent temp file leftovers, reuse core type ([#487](https://github.com/DS4SD/docling/issues/487)) ([`051789d`](https://github.com/DS4SD/docling/commit/051789d01706d3823dd6307eca4dc5faacd1b7ce))
|
||||||
|
|
||||||
## [v2.8.1](https://github.com/DS4SD/docling/releases/tag/v2.8.1) - 2024-11-29
|
## [v2.8.1](https://github.com/DS4SD/docling/releases/tag/v2.8.1) - 2024-11-29
|
||||||
|
|
||||||
### Fix
|
### Fix
|
||||||
|
@ -2,6 +2,7 @@ import importlib
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
@ -9,7 +10,7 @@ from pathlib import Path
|
|||||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_source_to_path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
@ -256,9 +257,10 @@ def convert(
|
|||||||
if from_formats is None:
|
if from_formats is None:
|
||||||
from_formats = [e for e in InputFormat]
|
from_formats = [e for e in InputFormat]
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tempdir:
|
||||||
input_doc_paths: List[Path] = []
|
input_doc_paths: List[Path] = []
|
||||||
for src in input_sources:
|
for src in input_sources:
|
||||||
source = resolve_file_source(source=src)
|
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
||||||
if not source.exists():
|
if not source.exists():
|
||||||
err_console.print(
|
err_console.print(
|
||||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||||
@ -302,7 +304,9 @@ def convert(
|
|||||||
ocr_options=ocr_options,
|
ocr_options=ocr_options,
|
||||||
do_table_structure=True,
|
do_table_structure=True,
|
||||||
)
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = (
|
||||||
|
True # do_cell_matching
|
||||||
|
)
|
||||||
pipeline_options.table_structure_options.mode = table_mode
|
pipeline_options.table_structure_options.mode = table_mode
|
||||||
|
|
||||||
if artifacts_path is not None:
|
if artifacts_path is not None:
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
from enum import Enum, auto
|
from enum import Enum, auto
|
||||||
from io import BytesIO
|
|
||||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
|||||||
Size,
|
Size,
|
||||||
TableCell,
|
TableCell,
|
||||||
)
|
)
|
||||||
|
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
|
||||||
|
DocumentStream,
|
||||||
|
)
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
|
|||||||
FAILURE = auto()
|
FAILURE = auto()
|
||||||
SUCCESS = auto()
|
SUCCESS = auto()
|
||||||
PARTIAL_SUCCESS = auto()
|
PARTIAL_SUCCESS = auto()
|
||||||
|
SKIPPED = auto()
|
||||||
|
|
||||||
|
|
||||||
class InputFormat(str, Enum):
|
class InputFormat(str, Enum):
|
||||||
@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
|
|||||||
DOCUMENT_BACKEND = auto()
|
DOCUMENT_BACKEND = auto()
|
||||||
MODEL = auto()
|
MODEL = auto()
|
||||||
DOC_ASSEMBLER = auto()
|
DOC_ASSEMBLER = auto()
|
||||||
|
USER_INPUT = auto()
|
||||||
|
|
||||||
|
|
||||||
class ErrorItem(BaseModel):
|
class ErrorItem(BaseModel):
|
||||||
@ -207,10 +211,3 @@ class Page(BaseModel):
|
|||||||
@property
|
@property
|
||||||
def image(self) -> Optional[Image]:
|
def image(self) -> Optional[Image]:
|
||||||
return self.get_image(scale=self._default_image_scale)
|
return self.get_image(scale=self._default_image_scale)
|
||||||
|
|
||||||
|
|
||||||
class DocumentStream(BaseModel):
|
|
||||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
||||||
|
|
||||||
name: str
|
|
||||||
stream: BytesIO
|
|
||||||
|
@ -3,7 +3,7 @@ import re
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
|
||||||
|
|
||||||
import filetype
|
import filetype
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
|||||||
)
|
)
|
||||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_source_to_stream
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
@ -164,12 +164,6 @@ class InputDocument(BaseModel):
|
|||||||
backend: Type[AbstractDocumentBackend],
|
backend: Type[AbstractDocumentBackend],
|
||||||
path_or_stream: Union[BytesIO, Path],
|
path_or_stream: Union[BytesIO, Path],
|
||||||
) -> None:
|
) -> None:
|
||||||
if backend is None:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
|
||||||
f"Please check your format configuration on DocumentConverter."
|
|
||||||
)
|
|
||||||
|
|
||||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||||
if not self._backend.is_valid():
|
if not self._backend.is_valid():
|
||||||
self.valid = False
|
self.valid = False
|
||||||
@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
|
|||||||
return ds_doc
|
return ds_doc
|
||||||
|
|
||||||
|
|
||||||
|
class _DummyBackend(AbstractDocumentBackend):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_formats(cls) -> Set[InputFormat]:
|
||||||
|
return set()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supports_pagination(cls) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def unload(self):
|
||||||
|
return super().unload()
|
||||||
|
|
||||||
|
|
||||||
class _DocumentConversionInput(BaseModel):
|
class _DocumentConversionInput(BaseModel):
|
||||||
|
|
||||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||||
@ -459,13 +472,14 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||||
) -> Iterable[InputDocument]:
|
) -> Iterable[InputDocument]:
|
||||||
for item in self.path_or_stream_iterator:
|
for item in self.path_or_stream_iterator:
|
||||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
||||||
format = self._guess_format(obj)
|
format = self._guess_format(obj)
|
||||||
|
backend: Type[AbstractDocumentBackend]
|
||||||
if format not in format_options.keys():
|
if format not in format_options.keys():
|
||||||
_log.info(
|
_log.error(
|
||||||
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
f"Input document {obj.name} does not match any allowed format."
|
||||||
)
|
)
|
||||||
continue
|
backend = _DummyBackend
|
||||||
else:
|
else:
|
||||||
backend = format_options[format].backend
|
backend = format_options[format].backend
|
||||||
|
|
||||||
|
@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|||||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
from docling.datamodel.base_models import (
|
||||||
|
ConversionStatus,
|
||||||
|
DoclingComponentType,
|
||||||
|
DocumentStream,
|
||||||
|
ErrorItem,
|
||||||
|
InputFormat,
|
||||||
|
)
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
@ -23,6 +29,7 @@ from docling.datamodel.document import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import DocumentLimits, settings
|
from docling.datamodel.settings import DocumentLimits, settings
|
||||||
|
from docling.exceptions import ConversionError
|
||||||
from docling.pipeline.base_pipeline import BasePipeline
|
from docling.pipeline.base_pipeline import BasePipeline
|
||||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
@ -85,7 +92,8 @@ class ImageFormatOption(FormatOption):
|
|||||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
_format_to_default_options = {
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||||
|
format_to_default_options = {
|
||||||
InputFormat.XLSX: FormatOption(
|
InputFormat.XLSX: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
||||||
),
|
),
|
||||||
@ -111,6 +119,10 @@ _format_to_default_options = {
|
|||||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
if (options := format_to_default_options.get(format)) is not None:
|
||||||
|
return options
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"No default options configured for {format}")
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverter:
|
class DocumentConverter:
|
||||||
@ -121,36 +133,26 @@ class DocumentConverter:
|
|||||||
allowed_formats: Optional[List[InputFormat]] = None,
|
allowed_formats: Optional[List[InputFormat]] = None,
|
||||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||||
):
|
):
|
||||||
self.allowed_formats = allowed_formats
|
self.allowed_formats = (
|
||||||
self.format_to_options = format_options
|
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
||||||
|
)
|
||||||
if self.allowed_formats is None:
|
self.format_to_options = {
|
||||||
# if self.format_to_options is not None:
|
format: (
|
||||||
# self.allowed_formats = self.format_to_options.keys()
|
_get_default_option(format=format)
|
||||||
# else:
|
if (custom_option := (format_options or {}).get(format)) is None
|
||||||
self.allowed_formats = [e for e in InputFormat] # all formats
|
else custom_option
|
||||||
|
)
|
||||||
if self.format_to_options is None:
|
for format in self.allowed_formats
|
||||||
self.format_to_options = _format_to_default_options
|
}
|
||||||
else:
|
|
||||||
for f in self.allowed_formats:
|
|
||||||
if f not in self.format_to_options.keys():
|
|
||||||
_log.debug(f"Requested format {f} will use default options.")
|
|
||||||
self.format_to_options[f] = _format_to_default_options[f]
|
|
||||||
|
|
||||||
remove_keys = []
|
|
||||||
for f in self.format_to_options.keys():
|
|
||||||
if f not in self.allowed_formats:
|
|
||||||
remove_keys.append(f)
|
|
||||||
|
|
||||||
for f in remove_keys:
|
|
||||||
self.format_to_options.pop(f)
|
|
||||||
|
|
||||||
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
||||||
|
|
||||||
def initialize_pipeline(self, format: InputFormat):
|
def initialize_pipeline(self, format: InputFormat):
|
||||||
"""Initialize the conversion pipeline for the selected format."""
|
"""Initialize the conversion pipeline for the selected format."""
|
||||||
self._get_pipeline(doc_format=format)
|
pipeline = self._get_pipeline(doc_format=format)
|
||||||
|
if pipeline is None:
|
||||||
|
raise ConversionError(
|
||||||
|
f"No pipeline could be initialized for format {format}"
|
||||||
|
)
|
||||||
|
|
||||||
@validate_call(config=ConfigDict(strict=True))
|
@validate_call(config=ConfigDict(strict=True))
|
||||||
def convert(
|
def convert(
|
||||||
@ -186,22 +188,28 @@ class DocumentConverter:
|
|||||||
limits=limits,
|
limits=limits,
|
||||||
)
|
)
|
||||||
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
||||||
|
|
||||||
|
had_result = False
|
||||||
for conv_res in conv_res_iter:
|
for conv_res in conv_res_iter:
|
||||||
|
had_result = True
|
||||||
if raises_on_error and conv_res.status not in {
|
if raises_on_error and conv_res.status not in {
|
||||||
ConversionStatus.SUCCESS,
|
ConversionStatus.SUCCESS,
|
||||||
ConversionStatus.PARTIAL_SUCCESS,
|
ConversionStatus.PARTIAL_SUCCESS,
|
||||||
}:
|
}:
|
||||||
raise RuntimeError(
|
raise ConversionError(
|
||||||
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
yield conv_res
|
yield conv_res
|
||||||
|
|
||||||
|
if not had_result and raises_on_error:
|
||||||
|
raise ConversionError(
|
||||||
|
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||||
|
)
|
||||||
|
|
||||||
def _convert(
|
def _convert(
|
||||||
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
||||||
) -> Iterator[ConversionResult]:
|
) -> Iterator[ConversionResult]:
|
||||||
assert self.format_to_options is not None
|
|
||||||
|
|
||||||
start_time = time.monotonic()
|
start_time = time.monotonic()
|
||||||
|
|
||||||
for input_batch in chunkify(
|
for input_batch in chunkify(
|
||||||
@ -223,27 +231,22 @@ class DocumentConverter:
|
|||||||
):
|
):
|
||||||
elapsed = time.monotonic() - start_time
|
elapsed = time.monotonic() - start_time
|
||||||
start_time = time.monotonic()
|
start_time = time.monotonic()
|
||||||
|
|
||||||
if item is not None:
|
|
||||||
_log.info(
|
_log.info(
|
||||||
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
||||||
)
|
)
|
||||||
yield item
|
yield item
|
||||||
else:
|
|
||||||
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
|
||||||
|
|
||||||
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
||||||
assert self.format_to_options is not None
|
|
||||||
|
|
||||||
fopt = self.format_to_options.get(doc_format)
|
fopt = self.format_to_options.get(doc_format)
|
||||||
|
|
||||||
if fopt is None:
|
if fopt is None:
|
||||||
raise RuntimeError(f"Could not get pipeline for {doc_format}")
|
return None
|
||||||
else:
|
else:
|
||||||
pipeline_class = fopt.pipeline_cls
|
pipeline_class = fopt.pipeline_cls
|
||||||
pipeline_options = fopt.pipeline_options
|
pipeline_options = fopt.pipeline_options
|
||||||
|
|
||||||
assert pipeline_options is not None
|
if pipeline_options is None:
|
||||||
|
return None
|
||||||
# TODO this will ignore if different options have been defined for the same pipeline class.
|
# TODO this will ignore if different options have been defined for the same pipeline class.
|
||||||
if (
|
if (
|
||||||
pipeline_class not in self.initialized_pipelines
|
pipeline_class not in self.initialized_pipelines
|
||||||
@ -257,11 +260,26 @@ class DocumentConverter:
|
|||||||
|
|
||||||
def _process_document(
|
def _process_document(
|
||||||
self, in_doc: InputDocument, raises_on_error: bool
|
self, in_doc: InputDocument, raises_on_error: bool
|
||||||
) -> Optional[ConversionResult]:
|
) -> ConversionResult:
|
||||||
assert self.allowed_formats is not None
|
|
||||||
assert in_doc.format in self.allowed_formats
|
|
||||||
|
|
||||||
|
valid = (
|
||||||
|
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
||||||
|
)
|
||||||
|
if valid:
|
||||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||||
|
else:
|
||||||
|
error_message = f"File format not allowed: {in_doc.file}"
|
||||||
|
if raises_on_error:
|
||||||
|
raise ConversionError(error_message)
|
||||||
|
else:
|
||||||
|
error_item = ErrorItem(
|
||||||
|
component_type=DoclingComponentType.USER_INPUT,
|
||||||
|
module_name="",
|
||||||
|
error_message=error_message,
|
||||||
|
)
|
||||||
|
conv_res = ConversionResult(
|
||||||
|
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
|
||||||
|
)
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
@ -270,26 +288,28 @@ class DocumentConverter:
|
|||||||
) -> ConversionResult:
|
) -> ConversionResult:
|
||||||
if in_doc.valid:
|
if in_doc.valid:
|
||||||
pipeline = self._get_pipeline(in_doc.format)
|
pipeline = self._get_pipeline(in_doc.format)
|
||||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
if pipeline is not None:
|
||||||
|
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||||
|
else:
|
||||||
if raises_on_error:
|
if raises_on_error:
|
||||||
raise RuntimeError(
|
raise ConversionError(
|
||||||
f"No pipeline could be initialized for {in_doc.file}."
|
f"No pipeline could be initialized for {in_doc.file}."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
input=in_doc,
|
||||||
return conv_res
|
status=ConversionStatus.FAILURE,
|
||||||
|
)
|
||||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if raises_on_error:
|
if raises_on_error:
|
||||||
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# invalid doc or not of desired format
|
# invalid doc or not of desired format
|
||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
input=in_doc,
|
||||||
|
status=ConversionStatus.FAILURE,
|
||||||
|
)
|
||||||
# TODO add error log why it failed.
|
# TODO add error log why it failed.
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
6
docling/exceptions.py
Normal file
6
docling/exceptions.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
class BaseError(RuntimeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ConversionError(BaseError):
|
||||||
|
pass
|
@ -1,5 +1,7 @@
|
|||||||
|
import csv
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from subprocess import DEVNULL, PIPE, Popen
|
from subprocess import DEVNULL, PIPE, Popen
|
||||||
from typing import Iterable, Optional, Tuple
|
from typing import Iterable, Optional, Tuple
|
||||||
@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
# _log.info(decoded_data)
|
# _log.info(decoded_data)
|
||||||
|
|
||||||
# Read the TSV file generated by Tesseract
|
# Read the TSV file generated by Tesseract
|
||||||
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
|
||||||
|
|
||||||
# Display the dataframe (optional)
|
# Display the dataframe (optional)
|
||||||
# _log.info("df: ", df.head())
|
# _log.info("df: ", df.head())
|
||||||
@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
high_res_image = page._backend.get_page_image(
|
high_res_image = page._backend.get_page_image(
|
||||||
scale=self.scale, cropbox=ocr_rect
|
scale=self.scale, cropbox=ocr_rect
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
with tempfile.NamedTemporaryFile(
|
with tempfile.NamedTemporaryFile(
|
||||||
suffix=".png", mode="w"
|
suffix=".png", mode="w+b", delete=False
|
||||||
) as image_file:
|
) as image_file:
|
||||||
fname = image_file.name
|
fname = image_file.name
|
||||||
high_res_image.save(fname)
|
high_res_image.save(image_file)
|
||||||
|
|
||||||
df = self._run_tesseract(fname)
|
df = self._run_tesseract(fname)
|
||||||
|
finally:
|
||||||
|
if os.path.exists(fname):
|
||||||
|
os.remove(fname)
|
||||||
|
|
||||||
# _log.info(df)
|
# _log.info(df)
|
||||||
|
|
||||||
|
31
docs/faq.md
31
docs/faq.md
@ -3,7 +3,9 @@
|
|||||||
This is a collection of frequently asked questions (FAQ) gathered from user questions on <https://github.com/DS4SD/docling/discussions>.
|
This is a collection of frequently asked questions (FAQ) gathered from user questions on <https://github.com/DS4SD/docling/discussions>.
|
||||||
|
|
||||||
|
|
||||||
### Python 3.13 support
|
??? question "Is Python 3.13 supported?"
|
||||||
|
|
||||||
|
### Is Python 3.13 supported?
|
||||||
|
|
||||||
Full support for Python 3.13 is currently waiting for [pytorch](https://github.com/pytorch/pytorch).
|
Full support for Python 3.13 is currently waiting for [pytorch](https://github.com/pytorch/pytorch).
|
||||||
|
|
||||||
@ -15,7 +17,7 @@ python3.13 -m venv venv
|
|||||||
source ./venv/bin/activate
|
source ./venv/bin/activate
|
||||||
|
|
||||||
# Install torch nightly builds, see https://pytorch.org/
|
# Install torch nightly builds, see https://pytorch.org/
|
||||||
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
|
pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
|
||||||
|
|
||||||
# Install docling
|
# Install docling
|
||||||
pip3 install docling
|
pip3 install docling
|
||||||
@ -29,8 +31,14 @@ _Note: we are disabling OCR since easyocr and the nightly torch builds have some
|
|||||||
Source: Issue [#136](https://github.com/DS4SD/docling/issues/136)
|
Source: Issue [#136](https://github.com/DS4SD/docling/issues/136)
|
||||||
|
|
||||||
|
|
||||||
|
??? question "Install conflicts with numpy (python 3.13)"
|
||||||
|
|
||||||
### Install conflicts with numpy (python 3.13)
|
### Install conflicts with numpy (python 3.13)
|
||||||
|
|
||||||
|
When using `docling-ibm-models>=2.0.7` and `deepsearch-glm>=0.26.2` these issues should not show up anymore.
|
||||||
|
Docling supports numpy versions `>=1.24.4,<3.0.0` which should match all usages.
|
||||||
|
|
||||||
|
**For older versions**
|
||||||
|
|
||||||
This has been observed when installing docling and langchain via poetry.
|
This has been observed when installing docling and langchain via poetry.
|
||||||
|
|
||||||
@ -54,19 +62,20 @@ numpy = [
|
|||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
Source: Issue [#283](https://github.com/DS4SD/docling/issues/283#issuecomment-2465035868)
|
Source: Issue [#283](https://github.com/DS4SD/docling/issues/283#issuecomment-2465035868)
|
||||||
|
|
||||||
|
|
||||||
### GPU support
|
??? question "Are text styles (bold, underline, etc) supported?"
|
||||||
|
|
||||||
TBA
|
### Are text styles (bold, underline, etc) supported?
|
||||||
|
|
||||||
|
Currently text styles are not supported in the `DoclingDocument` format.
|
||||||
|
If you are interested in contributing this feature, please open a discussion topic to brainstorm on the design.
|
||||||
|
|
||||||
|
_Note: this is not a simple topic_
|
||||||
|
|
||||||
|
|
||||||
### Text styles (bold, underline, etc)
|
??? question "How do I run completely offline?"
|
||||||
|
|
||||||
TBA
|
|
||||||
|
|
||||||
|
|
||||||
### How do I run completely offline?
|
### How do I run completely offline?
|
||||||
|
|
||||||
@ -89,6 +98,7 @@ converter = DocumentConverter(
|
|||||||
Source: Issue [#326](https://github.com/DS4SD/docling/issues/326)
|
Source: Issue [#326](https://github.com/DS4SD/docling/issues/326)
|
||||||
|
|
||||||
|
|
||||||
|
??? question "Which model weights are needed to run Docling?"
|
||||||
### Which model weights are needed to run Docling?
|
### Which model weights are needed to run Docling?
|
||||||
|
|
||||||
Model weights are needed for the AI models used in the PDF pipeline. Other document types (docx, pptx, etc) do not have any such requirement.
|
Model weights are needed for the AI models used in the PDF pipeline. Other document types (docx, pptx, etc) do not have any such requirement.
|
||||||
@ -98,6 +108,7 @@ For processing PDF documents, Docling requires the model weights from <https://h
|
|||||||
When OCR is enabled, some engines also require model artifacts. For example EasyOCR, for which Docling has [special pipeline options](https://github.com/DS4SD/docling/blob/main/docling/datamodel/pipeline_options.py#L68) to control the runtime behavior.
|
When OCR is enabled, some engines also require model artifacts. For example EasyOCR, for which Docling has [special pipeline options](https://github.com/DS4SD/docling/blob/main/docling/datamodel/pipeline_options.py#L68) to control the runtime behavior.
|
||||||
|
|
||||||
|
|
||||||
|
??? question "SSL error downloading model weights"
|
||||||
|
|
||||||
### SSL error downloading model weights
|
### SSL error downloading model weights
|
||||||
|
|
||||||
@ -114,6 +125,8 @@ Possible solutions were
|
|||||||
- Use [pip-system-certs](https://pypi.org/project/pip-system-certs/) to use the latest trusted certificates on your system.
|
- Use [pip-system-certs](https://pypi.org/project/pip-system-certs/) to use the latest trusted certificates on your system.
|
||||||
|
|
||||||
|
|
||||||
|
??? question "Which OCR languages are supported?"
|
||||||
|
|
||||||
### Which OCR languages are supported?
|
### Which OCR languages are supported?
|
||||||
|
|
||||||
Docling supports multiple OCR engines, each of which has its own list of supported languages.
|
Docling supports multiple OCR engines, each of which has its own list of supported languages.
|
||||||
|
1025
poetry.lock
generated
1025
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "docling"
|
name = "docling"
|
||||||
version = "2.8.1" # DO NOT EDIT, updated automatically
|
version = "2.8.2" # DO NOT EDIT, updated automatically
|
||||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||||
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
@ -26,7 +26,7 @@ packages = [{include = "docling"}]
|
|||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
pydantic = ">=2.0.0,<2.10"
|
pydantic = ">=2.0.0,<2.10"
|
||||||
docling-core = "^2.5.1"
|
docling-core = "^2.6.1"
|
||||||
docling-ibm-models = "^2.0.6"
|
docling-ibm-models = "^2.0.6"
|
||||||
deepsearch-glm = "^0.26.1"
|
deepsearch-glm = "^0.26.1"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
@ -90,10 +90,13 @@ langchain-huggingface = "^0.0.3"
|
|||||||
langchain-milvus = "^0.1.4"
|
langchain-milvus = "^0.1.4"
|
||||||
langchain-text-splitters = "^0.2.4"
|
langchain-text-splitters = "^0.2.4"
|
||||||
|
|
||||||
|
[tool.poetry.group.constraints]
|
||||||
|
optional = true
|
||||||
|
|
||||||
[tool.poetry.group.constraints.dependencies]
|
[tool.poetry.group.constraints.dependencies]
|
||||||
numpy = [
|
numpy = [
|
||||||
{ version = "^2.1.0", markers = 'python_version >= "3.13"' },
|
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
|
||||||
{ version = "^1.24.4", markers = 'python_version < "3.13"' },
|
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.poetry.group.mac_intel]
|
[tool.poetry.group.mac_intel]
|
||||||
|
@ -10,7 +10,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
|||||||
|
|
||||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||||
|
|
||||||
GENERATE = True
|
GENERATE = False
|
||||||
|
|
||||||
|
|
||||||
def get_pdf_path():
|
def get_pdf_path():
|
||||||
|
45
tests/test_invalid_input.py
Normal file
45
tests/test_invalid_input.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import ConversionStatus, DocumentStream
|
||||||
|
from docling.document_converter import ConversionError, DocumentConverter
|
||||||
|
|
||||||
|
|
||||||
|
def get_pdf_path():
|
||||||
|
|
||||||
|
pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
|
||||||
|
return pdf_path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def converter():
|
||||||
|
converter = DocumentConverter()
|
||||||
|
|
||||||
|
return converter
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
|
||||||
|
result = converter.convert(
|
||||||
|
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
|
||||||
|
)
|
||||||
|
assert result.status == ConversionStatus.SKIPPED
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
|
||||||
|
with pytest.raises(ConversionError):
|
||||||
|
converter.convert(
|
||||||
|
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
|
||||||
|
raises_on_error=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
|
||||||
|
result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
|
||||||
|
assert result.status == ConversionStatus.FAILURE
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
|
||||||
|
with pytest.raises(ConversionError):
|
||||||
|
converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)
|
Loading…
Reference in New Issue
Block a user