More renaming, design enrichment interface

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-11 10:21:31 +02:00
parent 051beae203
commit 304d16029a
17 changed files with 85 additions and 52 deletions

View File

@ -270,11 +270,12 @@ conv_input = DocumentConversionInput.from_paths(
### Convert from binary PDF streams ### Convert from binary PDF streams
You can convert PDFs from a binary stream instead of from the filesystem as follows: You can convert PDFs from a binary stream instead of from the filesystem as follows:
```python ```python
buf = BytesIO(your_binary_stream) buf = BytesIO(your_binary_stream)
docs = [DocumentStream(name="my_doc.pdf", stream=buf)] docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
conv_input = DocumentConversionInput.from_streams(docs) conv_input = DocumentConversionInput.from_streams(docs)
results = doc_converter.convert(conv_input) results = doc_converter.convert_batch(conv_input)
``` ```
### Limit resource usage ### Limit resource usage

View File

@ -94,21 +94,21 @@ def export_documents(
fname = output_dir / f"{doc_filename}.txt" fname = output_dir / f"{doc_filename}.txt"
with fname.open("w") as fp: with fname.open("w") as fp:
_log.info(f"writing Text output to {fname}") _log.info(f"writing Text output to {fname}")
fp.write(conv_res.render_as_text_v1()) fp.write(conv_res.render_as_text())
# Export Markdown format: # Export Markdown format:
if export_md: if export_md:
fname = output_dir / f"{doc_filename}.md" fname = output_dir / f"{doc_filename}.md"
with fname.open("w") as fp: with fname.open("w") as fp:
_log.info(f"writing Markdown output to {fname}") _log.info(f"writing Markdown output to {fname}")
fp.write(conv_res.render_as_markdown_v1()) fp.write(conv_res.render_as_markdown())
# Export Document Tags format: # Export Document Tags format:
if export_doctags: if export_doctags:
fname = output_dir / f"{doc_filename}.doctags" fname = output_dir / f"{doc_filename}.doctags"
with fname.open("w") as fp: with fname.open("w") as fp:
_log.info(f"writing Doc Tags output to {fname}") _log.info(f"writing Doc Tags output to {fname}")
fp.write(conv_res.render_as_doctags_v1()) fp.write(conv_res.render_as_doctags())
else: else:
_log.warning(f"Document {conv_res.input.file} failed to convert.") _log.warning(f"Document {conv_res.input.file} failed to convert.")
@ -236,7 +236,7 @@ def convert(
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert(input) conv_results = doc_converter.convert_batch(input)
output.mkdir(parents=True, exist_ok=True) output.mkdir(parents=True, exist_ok=True)
export_documents( export_documents(

View File

@ -351,11 +351,11 @@ class ConvertedDocument(BaseModel):
return ds_doc return ds_doc
@deprecated("Use output.export_to_dict() instead.") @deprecated("Use output.export_to_dict() instead.")
def render_as_dict_v1(self): def render_as_dict(self):
return self.legacy_output.model_dump(by_alias=True, exclude_none=True) return self.legacy_output.model_dump(by_alias=True, exclude_none=True)
@deprecated("Use output.export_to_markdown() instead.") @deprecated("Use output.export_to_markdown() instead.")
def render_as_markdown_v1( def render_as_markdown(
self, self,
delim: str = "\n\n", delim: str = "\n\n",
main_text_start: int = 0, main_text_start: int = 0,
@ -381,7 +381,7 @@ class ConvertedDocument(BaseModel):
) )
@deprecated("Use output.export_to_text() instead.") @deprecated("Use output.export_to_text() instead.")
def render_as_text_v1( def render_as_text(
self, self,
delim: str = "\n\n", delim: str = "\n\n",
main_text_start: int = 0, main_text_start: int = 0,
@ -402,7 +402,7 @@ class ConvertedDocument(BaseModel):
) )
@deprecated("Use output.export_to_document_tokens() instead.") @deprecated("Use output.export_to_document_tokens() instead.")
def render_as_doctags_v1( def render_as_doctags(
self, self,
delim: str = "\n\n", delim: str = "\n\n",
main_text_start: int = 0, main_text_start: int = 0,
@ -501,11 +501,12 @@ class DocumentConversionInput(BaseModel):
mime = filetype.guess_mime(str(obj)) mime = filetype.guess_mime(str(obj))
elif isinstance(obj, DocumentStream): elif isinstance(obj, DocumentStream):
mime = filetype.guess_mime(obj.stream.read(8192)) mime = filetype.guess_mime(obj.stream.read(8192))
else:
1 == 1 # alert!!
if mime is None: if mime is None:
# TODO improve this.
if obj.suffix == ".html": if obj.suffix == ".html":
mime = "text/html" mime = "text/html"
format = MimeTypeToFormat.get(mime) format = MimeTypeToFormat.get(mime)
return format return format

View File

@ -14,6 +14,7 @@ from pydantic import (
field_validator, field_validator,
model_validator, model_validator,
) )
from typing_extensions import deprecated
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@ -28,7 +29,7 @@ from docling.datamodel.document import (
) )
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.pipeline.base_model_pipeline import BaseModelPipeline from docling.pipeline.base_model_pipeline import AbstractModelPipeline
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
from docling.utils.utils import chunkify from docling.utils.utils import chunkify
@ -37,7 +38,7 @@ _log = logging.getLogger(__name__)
class FormatOption(BaseModel): class FormatOption(BaseModel):
pipeline_cls: Type[BaseModelPipeline] pipeline_cls: Type[AbstractModelPipeline]
pipeline_options: Optional[PipelineOptions] = None pipeline_options: Optional[PipelineOptions] = None
backend: Type[AbstractDocumentBackend] backend: Type[AbstractDocumentBackend]
@ -114,11 +115,17 @@ class DocumentConverter:
_log.info(f"Requested format {f} will use default options.") _log.info(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f] self.format_to_options[f] = _format_to_default_options[f]
self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = ( self.initialized_pipelines: Dict[
{} Type[AbstractModelPipeline], AbstractModelPipeline
) ] = {}
@deprecated("Use convert_batch instead.")
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]: def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
yield from self.convert_batch(input=input)
def convert_batch(
self, input: DocumentConversionInput, raise_on_error: bool = False
) -> Iterable[ConversionResult]:
for input_batch in chunkify( for input_batch in chunkify(
input.docs(self.format_to_options), input.docs(self.format_to_options),
@ -136,7 +143,9 @@ class DocumentConverter:
if item is not None: if item is not None:
yield item yield item
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult: def convert_single(
self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False
) -> ConversionResult:
"""Convert a single document. """Convert a single document.
Args: Args:
@ -177,7 +186,7 @@ class DocumentConverter:
f"Unexpected file path type encountered: {type(source)}" f"Unexpected file path type encountered: {type(source)}"
) )
conv_inp = DocumentConversionInput.from_paths(paths=[local_path]) conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
conv_res_iter = self.convert(conv_inp) conv_res_iter = self.convert_batch(conv_inp)
conv_res: ConversionResult = next(conv_res_iter) conv_res: ConversionResult = next(conv_res_iter)
if conv_res.status not in { if conv_res.status not in {
ConversionStatus.SUCCESS, ConversionStatus.SUCCESS,
@ -186,7 +195,7 @@ class DocumentConverter:
raise RuntimeError(f"Conversion failed with status: {conv_res.status}") raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
return conv_res return conv_res
def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]: def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
fopt = self.format_to_options.get(doc.format) fopt = self.format_to_options.get(doc.format)
if fopt is None: if fopt is None:

View File

@ -1,5 +1,7 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Iterable from typing import Any, Iterable
from docling_core.types.experimental import DoclingDocument, NodeItem
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page
@ -8,3 +10,11 @@ class AbstractPageModel(ABC):
@abstractmethod @abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
pass pass
class AbstractEnrichmentModel(ABC):
@abstractmethod
def __call__(
self, doc: DoclingDocument, elements: Iterable[NodeItem]
) -> Iterable[Any]:
pass

View File

@ -21,7 +21,7 @@ from docling.utils.utils import chunkify
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class BaseModelPipeline(ABC): class AbstractModelPipeline(ABC):
def __init__(self, pipeline_options: PipelineOptions): def __init__(self, pipeline_options: PipelineOptions):
self.pipeline_options = pipeline_options self.pipeline_options = pipeline_options
self.model_pipe: List[Callable] = [] self.model_pipe: List[Callable] = []
@ -31,7 +31,7 @@ class BaseModelPipeline(ABC):
pass pass
@abstractmethod @abstractmethod
def assemble_document( def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult: ) -> ConversionResult:
pass pass
@ -47,9 +47,9 @@ class BaseModelPipeline(ABC):
pass pass
class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name. class PaginatedModelPipeline(AbstractModelPipeline): # TODO this is a bad name.
def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]: def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe: for model in self.model_pipe:
page_batch = model(page_batch) page_batch = model(page_batch)
@ -83,7 +83,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
) )
# 2. Run pipeline stages # 2. Run pipeline stages
pipeline_pages = self.apply_on_pages(init_pages) pipeline_pages = self._apply_on_pages(init_pages)
for p in pipeline_pages: # Must exhaust! for p in pipeline_pages: # Must exhaust!
pass pass
@ -91,7 +91,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
end_pb_time = time.time() - start_pb_time end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}") _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
conv_res = self.assemble_document(in_doc, conv_res) conv_res = self._assemble_document(in_doc, conv_res)
status = ConversionStatus.SUCCESS status = ConversionStatus.SUCCESS
for page in conv_res.pages: for page in conv_res.pages:

View File

@ -1,4 +1,7 @@
import logging import logging
from typing import Iterable
from docling_core.types.experimental import NodeItem
from docling.backend.abstract_backend import ( from docling.backend.abstract_backend import (
AbstractDocumentBackend, AbstractDocumentBackend,
@ -7,19 +10,19 @@ from docling.backend.abstract_backend import (
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.pipeline.base_model_pipeline import BaseModelPipeline from docling.pipeline.base_model_pipeline import AbstractModelPipeline
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class SimpleModelPipeline(BaseModelPipeline): class SimpleModelPipeline(AbstractModelPipeline):
"""SimpleModelPipeline. """SimpleModelPipeline.
This class is used at the moment for formats / backends This class is used at the moment for formats / backends
which produce straight DoclingDocument output. which produce straight DoclingDocument output.
""" """
def __init__(self, pipeline_options: PdfPipelineOptions): def __init__(self, pipeline_options: PipelineOptions):
super().__init__(pipeline_options) super().__init__(pipeline_options)
def execute(self, in_doc: InputDocument) -> ConversionResult: def execute(self, in_doc: InputDocument) -> ConversionResult:
@ -45,16 +48,21 @@ class SimpleModelPipeline(BaseModelPipeline):
# a DoclingDocument straight. # a DoclingDocument straight.
conv_res.output = in_doc._backend.convert() conv_res.output = in_doc._backend.convert()
# Do other stuff with conv_res.output # Do other stuff with conv_res.output
conv_res = self.assemble_document(in_doc, conv_res) conv_res = self._assemble_document(in_doc, conv_res)
conv_res.status = ConversionStatus.SUCCESS conv_res.status = ConversionStatus.SUCCESS
return conv_res return conv_res
def assemble_document( # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
# for model in self.model_pipe:
# element_batch = model(element_batch)
#
# yield from element_batch
def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult: ) -> ConversionResult:
return conv_res return conv_res

View File

@ -83,6 +83,10 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}), PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
] ]
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument
]
@staticmethod @staticmethod
def download_models_hf( def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False local_dir: Optional[Path] = None, force: bool = False
@ -104,7 +108,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
return page return page
def assemble_document( def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult: ) -> ConversionResult:
all_elements = [] all_elements = []

View File

@ -36,25 +36,25 @@ def export_documents(
with (output_dir / f"{doc_filename}.legacy.json").open( with (output_dir / f"{doc_filename}.legacy.json").open(
"w", encoding="utf-8" "w", encoding="utf-8"
) as fp: ) as fp:
fp.write(json.dumps(conv_res.render_as_dict_v1())) fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format: # Export Text format:
with (output_dir / f"{doc_filename}.legacy.txt").open( with (output_dir / f"{doc_filename}.legacy.txt").open(
"w", encoding="utf-8" "w", encoding="utf-8"
) as fp: ) as fp:
fp.write(conv_res.render_as_text_v1()) fp.write(conv_res.render_as_text())
# Export Markdown format: # Export Markdown format:
with (output_dir / f"{doc_filename}.legacy.md").open( with (output_dir / f"{doc_filename}.legacy.md").open(
"w", encoding="utf-8" "w", encoding="utf-8"
) as fp: ) as fp:
fp.write(conv_res.render_as_markdown_v1()) fp.write(conv_res.render_as_markdown())
# Export Document Tags format: # Export Document Tags format:
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open( with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
"w", encoding="utf-8" "w", encoding="utf-8"
) as fp: ) as fp:
fp.write(conv_res.render_as_doctags_v1()) fp.write(conv_res.render_as_doctags())
if USE_V2: if USE_V2:
# Export Docling document format to JSON (experimental): # Export Docling document format to JSON (experimental):
@ -129,7 +129,7 @@ def main():
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert(input) conv_results = doc_converter.convert_batch(input)
success_count, partial_success_count, failure_count = export_documents( success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch") conv_results, output_dir=Path("./scratch")
) )

View File

@ -39,17 +39,17 @@ def export_documents(
# Export Text format: # Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp: with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text_v1()) fp.write(conv_res.render_as_text())
# Export Markdown format: # Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp: with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown_v1()) fp.write(conv_res.render_as_markdown())
# Export Document Tags format: # Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open( with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8" "w", encoding="utf-8"
) as fp: ) as fp:
fp.write(conv_res.render_as_doctags_v1()) fp.write(conv_res.render_as_doctags())
else: else:
_log.info(f"Document {conv_res.input.file} failed to convert.") _log.info(f"Document {conv_res.input.file} failed to convert.")
@ -157,7 +157,7 @@ def main():
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert(input) conv_results = doc_converter.convert_batch(input)
success_count, failure_count = export_documents( success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch") conv_results, output_dir=Path("./scratch")
) )

View File

@ -42,7 +42,7 @@ def main():
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert(input_files) conv_results = doc_converter.convert_batch(input_files)
success_count = 0 success_count = 0
failure_count = 0 failure_count = 0

View File

@ -41,7 +41,7 @@ def main():
start_time = time.time() start_time = time.time()
converted_docs = doc_converter.convert(input_files) converted_docs = doc_converter.convert_batch(input_files)
success_count = 0 success_count = 0
failure_count = 0 failure_count = 0

View File

@ -25,7 +25,7 @@ def main():
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert(input_files) conv_results = doc_converter.convert_batch(input_files)
success_count = 0 success_count = 0
failure_count = 0 failure_count = 0

View File

@ -50,7 +50,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
}, },
) )
conv_results = doc_converter.convert(input) conv_results = doc_converter.convert_batch(input)
for res in conv_results: for res in conv_results:
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md" out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"

View File

@ -39,11 +39,11 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt") doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
with open(doctags_fn, "w") as fd: with open(doctags_fn, "w") as fd:
fd.write(doc_result.render_as_doctags_v1()) fd.write(doc_result.render_as_doctags())
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md") md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
with open(md_fn, "w") as fd: with open(md_fn, "w") as fd:
fd.write(doc_result.render_as_markdown_v1()) fd.write(doc_result.render_as_markdown())
def get_pdf_paths(): def get_pdf_paths():

View File

@ -54,7 +54,7 @@ def test_batch_path(converter: DocumentConverter):
conv_input = DocumentConversionInput.from_paths([pdf_path]) conv_input = DocumentConversionInput.from_paths([pdf_path])
results = converter.convert(conv_input) results = converter.convert_batch(conv_input)
for doc_result in results: for doc_result in results:
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
@ -69,7 +69,7 @@ def test_batch_bytes(converter: DocumentConverter):
docs = [DocumentStream(name=pdf_path.name, stream=buf)] docs = [DocumentStream(name=pdf_path.name, stream=buf)]
conv_input = DocumentConversionInput.from_streams(docs) conv_input = DocumentConversionInput.from_streams(docs)
results = converter.convert(conv_input) results = converter.convert_batch(conv_input)
for doc_result in results: for doc_result in results:
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)

View File

@ -198,8 +198,8 @@ def verify_conversion_result_v1(
doc_pred_pages: List[Page] = doc_result.pages doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DsDocument = doc_result.legacy_output doc_pred: DsDocument = doc_result.legacy_output
doc_pred_md = doc_result.render_as_markdown_v1() doc_pred_md = doc_result.render_as_markdown()
doc_pred_dt = doc_result.render_as_doctags_v1() doc_pred_dt = doc_result.render_as_doctags()
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name