More renaming, design enrichment interface

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-11 10:21:31 +02:00
parent 051beae203
commit 304d16029a
17 changed files with 85 additions and 52 deletions

View File

@ -270,11 +270,12 @@ conv_input = DocumentConversionInput.from_paths(
### Convert from binary PDF streams
You can convert PDFs from a binary stream instead of from the filesystem as follows:
```python
buf = BytesIO(your_binary_stream)
docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
conv_input = DocumentConversionInput.from_streams(docs)
results = doc_converter.convert(conv_input)
results = doc_converter.convert_batch(conv_input)
```
### Limit resource usage

View File

@ -94,21 +94,21 @@ def export_documents(
fname = output_dir / f"{doc_filename}.txt"
with fname.open("w") as fp:
_log.info(f"writing Text output to {fname}")
fp.write(conv_res.render_as_text_v1())
fp.write(conv_res.render_as_text())
# Export Markdown format:
if export_md:
fname = output_dir / f"{doc_filename}.md"
with fname.open("w") as fp:
_log.info(f"writing Markdown output to {fname}")
fp.write(conv_res.render_as_markdown_v1())
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
if export_doctags:
fname = output_dir / f"{doc_filename}.doctags"
with fname.open("w") as fp:
_log.info(f"writing Doc Tags output to {fname}")
fp.write(conv_res.render_as_doctags_v1())
fp.write(conv_res.render_as_doctags())
else:
_log.warning(f"Document {conv_res.input.file} failed to convert.")
@ -236,7 +236,7 @@ def convert(
start_time = time.time()
conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_batch(input)
output.mkdir(parents=True, exist_ok=True)
export_documents(

View File

@ -351,11 +351,11 @@ class ConvertedDocument(BaseModel):
return ds_doc
@deprecated("Use output.export_to_dict() instead.")
def render_as_dict_v1(self):
def render_as_dict(self):
return self.legacy_output.model_dump(by_alias=True, exclude_none=True)
@deprecated("Use output.export_to_markdown() instead.")
def render_as_markdown_v1(
def render_as_markdown(
self,
delim: str = "\n\n",
main_text_start: int = 0,
@ -381,7 +381,7 @@ class ConvertedDocument(BaseModel):
)
@deprecated("Use output.export_to_text() instead.")
def render_as_text_v1(
def render_as_text(
self,
delim: str = "\n\n",
main_text_start: int = 0,
@ -402,7 +402,7 @@ class ConvertedDocument(BaseModel):
)
@deprecated("Use output.export_to_document_tokens() instead.")
def render_as_doctags_v1(
def render_as_doctags(
self,
delim: str = "\n\n",
main_text_start: int = 0,
@ -501,11 +501,12 @@ class DocumentConversionInput(BaseModel):
mime = filetype.guess_mime(str(obj))
elif isinstance(obj, DocumentStream):
mime = filetype.guess_mime(obj.stream.read(8192))
else:
1 == 1 # alert!!
if mime is None:
# TODO improve this.
if obj.suffix == ".html":
mime = "text/html"
format = MimeTypeToFormat.get(mime)
return format

View File

@ -14,6 +14,7 @@ from pydantic import (
field_validator,
model_validator,
)
from typing_extensions import deprecated
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@ -28,7 +29,7 @@ from docling.datamodel.document import (
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
from docling.utils.utils import chunkify
@ -37,7 +38,7 @@ _log = logging.getLogger(__name__)
class FormatOption(BaseModel):
pipeline_cls: Type[BaseModelPipeline]
pipeline_cls: Type[AbstractModelPipeline]
pipeline_options: Optional[PipelineOptions] = None
backend: Type[AbstractDocumentBackend]
@ -114,11 +115,17 @@ class DocumentConverter:
_log.info(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f]
self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
{}
)
self.initialized_pipelines: Dict[
Type[AbstractModelPipeline], AbstractModelPipeline
] = {}
@deprecated("Use convert_batch instead.")
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
    """Deprecated alias that lazily yields every result of ``convert_batch``.

    Args:
        input: The conversion input, forwarded unchanged as the ``input``
            keyword argument of ``convert_batch``.

    Yields:
        Each item produced by ``convert_batch``, in the same order.
    """
    # `raise_on_error` is not forwarded, so `convert_batch` uses its default.
    for conv_res in self.convert_batch(input=input):
        yield conv_res
def convert_batch(
self, input: DocumentConversionInput, raise_on_error: bool = False
) -> Iterable[ConversionResult]:
for input_batch in chunkify(
input.docs(self.format_to_options),
@ -136,7 +143,9 @@ class DocumentConverter:
if item is not None:
yield item
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
def convert_single(
self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False
) -> ConversionResult:
"""Convert a single document.
Args:
@ -177,7 +186,7 @@ class DocumentConverter:
f"Unexpected file path type encountered: {type(source)}"
)
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
conv_res_iter = self.convert(conv_inp)
conv_res_iter = self.convert_batch(conv_inp)
conv_res: ConversionResult = next(conv_res_iter)
if conv_res.status not in {
ConversionStatus.SUCCESS,
@ -186,7 +195,7 @@ class DocumentConverter:
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
return conv_res
def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
fopt = self.format_to_options.get(doc.format)
if fopt is None:

View File

@ -1,5 +1,7 @@
from abc import ABC, abstractmethod
from typing import Iterable
from typing import Any, Iterable
from docling_core.types.experimental import DoclingDocument, NodeItem
from docling.datamodel.base_models import Page
@ -8,3 +10,11 @@ class AbstractPageModel(ABC):
@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
pass
class AbstractEnrichmentModel(ABC):
    """Interface for models that are called with a document and node items.

    Concrete subclasses must implement ``__call__``.
    """

    @abstractmethod
    def __call__(
        self, doc: DoclingDocument, elements: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Process ``elements`` together with ``doc``.

        Args:
            doc: The document handed in by the caller.
            elements: The node items to process (presumably items of
                ``doc`` -- confirm against the calling pipeline).

        Returns:
            An iterable of results; the item type is left open (``Any``).
        """

View File

@ -21,7 +21,7 @@ from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
class BaseModelPipeline(ABC):
class AbstractModelPipeline(ABC):
def __init__(self, pipeline_options: PipelineOptions):
self.pipeline_options = pipeline_options
self.model_pipe: List[Callable] = []
@ -31,7 +31,7 @@ class BaseModelPipeline(ABC):
pass
@abstractmethod
def assemble_document(
def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
pass
@ -47,9 +47,9 @@ class BaseModelPipeline(ABC):
pass
class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
class PaginatedModelPipeline(AbstractModelPipeline): # TODO this is a bad name.
def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe:
page_batch = model(page_batch)
@ -83,7 +83,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
)
# 2. Run pipeline stages
pipeline_pages = self.apply_on_pages(init_pages)
pipeline_pages = self._apply_on_pages(init_pages)
for p in pipeline_pages: # Must exhaust!
pass
@ -91,7 +91,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
conv_res = self.assemble_document(in_doc, conv_res)
conv_res = self._assemble_document(in_doc, conv_res)
status = ConversionStatus.SUCCESS
for page in conv_res.pages:

View File

@ -1,4 +1,7 @@
import logging
from typing import Iterable
from docling_core.types.experimental import NodeItem
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
@ -7,19 +10,19 @@ from docling.backend.abstract_backend import (
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
_log = logging.getLogger(__name__)
class SimpleModelPipeline(BaseModelPipeline):
class SimpleModelPipeline(AbstractModelPipeline):
"""SimpleModelPipeline.
This class is used at the moment for formats / backends
which produce straight DoclingDocument output.
"""
def __init__(self, pipeline_options: PdfPipelineOptions):
def __init__(self, pipeline_options: PipelineOptions):
super().__init__(pipeline_options)
def execute(self, in_doc: InputDocument) -> ConversionResult:
@ -45,16 +48,21 @@ class SimpleModelPipeline(BaseModelPipeline):
# a DoclingDocument straight.
conv_res.output = in_doc._backend.convert()
# Do other stuff with conv_res.output
conv_res = self.assemble_document(in_doc, conv_res)
conv_res = self._assemble_document(in_doc, conv_res)
conv_res.status = ConversionStatus.SUCCESS
return conv_res
def assemble_document(
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
# for model in self.model_pipe:
# element_batch = model(element_batch)
#
# yield from element_batch
def _assemble_document(
    self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
    """Return ``conv_res`` unchanged; this pipeline has no assembly step.

    Args:
        in_doc: The input document (not used here).
        conv_res: The conversion result built so far.

    Returns:
        The same ``conv_res`` object that was passed in.
    """
    return conv_res

View File

@ -83,6 +83,10 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
]
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument
]
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
@ -104,7 +108,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
return page
def assemble_document(
def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
all_elements = []

View File

@ -36,25 +36,25 @@ def export_documents(
with (output_dir / f"{doc_filename}.legacy.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict_v1()))
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.legacy.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_text_v1())
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.legacy.md").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_markdown_v1())
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags_v1())
fp.write(conv_res.render_as_doctags())
if USE_V2:
# Export Docling document format to JSON (experimental):
@ -129,7 +129,7 @@ def main():
start_time = time.time()
conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_batch(input)
success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)

View File

@ -39,17 +39,17 @@ def export_documents(
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text_v1())
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown_v1())
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags_v1())
fp.write(conv_res.render_as_doctags())
else:
_log.info(f"Document {conv_res.input.file} failed to convert.")
@ -157,7 +157,7 @@ def main():
start_time = time.time()
conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_batch(input)
success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)

View File

@ -42,7 +42,7 @@ def main():
start_time = time.time()
conv_results = doc_converter.convert(input_files)
conv_results = doc_converter.convert_batch(input_files)
success_count = 0
failure_count = 0

View File

@ -41,7 +41,7 @@ def main():
start_time = time.time()
converted_docs = doc_converter.convert(input_files)
converted_docs = doc_converter.convert_batch(input_files)
success_count = 0
failure_count = 0

View File

@ -25,7 +25,7 @@ def main():
start_time = time.time()
conv_results = doc_converter.convert(input_files)
conv_results = doc_converter.convert_batch(input_files)
success_count = 0
failure_count = 0

View File

@ -50,7 +50,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
},
)
conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_batch(input)
for res in conv_results:
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"

View File

@ -39,11 +39,11 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
with open(doctags_fn, "w") as fd:
fd.write(doc_result.render_as_doctags_v1())
fd.write(doc_result.render_as_doctags())
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
with open(md_fn, "w") as fd:
fd.write(doc_result.render_as_markdown_v1())
fd.write(doc_result.render_as_markdown())
def get_pdf_paths():

View File

@ -54,7 +54,7 @@ def test_batch_path(converter: DocumentConverter):
conv_input = DocumentConversionInput.from_paths([pdf_path])
results = converter.convert(conv_input)
results = converter.convert_batch(conv_input)
for doc_result in results:
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
@ -69,7 +69,7 @@ def test_batch_bytes(converter: DocumentConverter):
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
conv_input = DocumentConversionInput.from_streams(docs)
results = converter.convert(conv_input)
results = converter.convert_batch(conv_input)
for doc_result in results:
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)

View File

@ -198,8 +198,8 @@ def verify_conversion_result_v1(
doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DsDocument = doc_result.legacy_output
doc_pred_md = doc_result.render_as_markdown_v1()
doc_pred_dt = doc_result.render_as_doctags_v1()
doc_pred_md = doc_result.render_as_markdown()
doc_pred_dt = doc_result.render_as_doctags()
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name