Mirror of https://github.com/DS4SD/docling.git
More renaming, design enrichment interface
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
parent 051beae203
commit 304d16029a
@@ -270,11 +270,12 @@ conv_input = DocumentConversionInput.from_paths(
### Convert from binary PDF streams

You can convert PDFs from a binary stream instead of from the filesystem as follows:

```python
buf = BytesIO(your_binary_stream)
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
conv_input = DocumentConversionInput.from_streams(docs)
results = doc_converter.convert(conv_input)
results = doc_converter.convert_batch(conv_input)
```
### Limit resource usage
@@ -94,21 +94,21 @@ def export_documents(
fname = output_dir / f"{doc_filename}.txt"
with fname.open("w") as fp:
_log.info(f"writing Text output to {fname}")
fp.write(conv_res.render_as_text_v1())
fp.write(conv_res.render_as_text())

# Export Markdown format:
if export_md:
fname = output_dir / f"{doc_filename}.md"
with fname.open("w") as fp:
_log.info(f"writing Markdown output to {fname}")
fp.write(conv_res.render_as_markdown_v1())
fp.write(conv_res.render_as_markdown())

# Export Document Tags format:
if export_doctags:
fname = output_dir / f"{doc_filename}.doctags"
with fname.open("w") as fp:
_log.info(f"writing Doc Tags output to {fname}")
fp.write(conv_res.render_as_doctags_v1())
fp.write(conv_res.render_as_doctags())

else:
_log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -236,7 +236,7 @@ def convert(

start_time = time.time()

conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_batch(input)

output.mkdir(parents=True, exist_ok=True)
export_documents(
@@ -351,11 +351,11 @@ class ConvertedDocument(BaseModel):
return ds_doc

@deprecated("Use output.export_to_dict() instead.")
def render_as_dict_v1(self):
def render_as_dict(self):
return self.legacy_output.model_dump(by_alias=True, exclude_none=True)

@deprecated("Use output.export_to_markdown() instead.")
def render_as_markdown_v1(
def render_as_markdown(
self,
delim: str = "\n\n",
main_text_start: int = 0,
@@ -381,7 +381,7 @@ class ConvertedDocument(BaseModel):
)

@deprecated("Use output.export_to_text() instead.")
def render_as_text_v1(
def render_as_text(
self,
delim: str = "\n\n",
main_text_start: int = 0,
@@ -402,7 +402,7 @@ class ConvertedDocument(BaseModel):
)

@deprecated("Use output.export_to_document_tokens() instead.")
def render_as_doctags_v1(
def render_as_doctags(
self,
delim: str = "\n\n",
main_text_start: int = 0,
@@ -501,11 +501,12 @@ class DocumentConversionInput(BaseModel):
mime = filetype.guess_mime(str(obj))
elif isinstance(obj, DocumentStream):
mime = filetype.guess_mime(obj.stream.read(8192))
else:
1 == 1 # alert!!
if mime is None:
# TODO improve this.

if obj.suffix == ".html":
mime = "text/html"

format = MimeTypeToFormat.get(mime)
return format
@@ -14,6 +14,7 @@ from pydantic import (
field_validator,
model_validator,
)
from typing_extensions import deprecated

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@@ -28,7 +29,7 @@ from docling.datamodel.document import (
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
from docling.utils.utils import chunkify
@@ -37,7 +38,7 @@ _log = logging.getLogger(__name__)


class FormatOption(BaseModel):
pipeline_cls: Type[BaseModelPipeline]
pipeline_cls: Type[AbstractModelPipeline]
pipeline_options: Optional[PipelineOptions] = None
backend: Type[AbstractDocumentBackend]
@@ -114,11 +115,17 @@ class DocumentConverter:
_log.info(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f]

self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
{}
)
self.initialized_pipelines: Dict[
Type[AbstractModelPipeline], AbstractModelPipeline
] = {}

@deprecated("Use convert_batch instead.")
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
yield from self.convert_batch(input=input)

def convert_batch(
self, input: DocumentConversionInput, raise_on_error: bool = False
) -> Iterable[ConversionResult]:

for input_batch in chunkify(
input.docs(self.format_to_options),
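The hunk above renames the batch entry point: `convert` stays behind as a deprecated wrapper and `convert_batch` becomes the main API. A minimal usage sketch against the renamed interface; the file path and the no-argument `DocumentConverter()` are illustrative assumptions, and import paths are taken from the modules touched in this diff:

```python
from pathlib import Path

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

# Hypothetical input; any local PDF path works here.
conv_input = DocumentConversionInput.from_paths(paths=[Path("my_doc.pdf")])

doc_converter = DocumentConverter()

# convert_batch() lazily yields one ConversionResult per input document.
for conv_res in doc_converter.convert_batch(conv_input):
    if conv_res.status == ConversionStatus.SUCCESS:
        print(conv_res.render_as_markdown())
```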
@@ -136,7 +143,9 @@ class DocumentConverter:
if item is not None:
yield item

def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
def convert_single(
self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False
) -> ConversionResult:
"""Convert a single document.

Args:
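`convert_single` also gains a `raise_on_error` flag in this hunk. A small sketch of the single-document path, assuming a placeholder filename:

```python
from docling.document_converter import DocumentConverter

doc_converter = DocumentConverter()

# Accepts a local path, a URL, or a string and returns one ConversionResult;
# per the hunks below, it raises RuntimeError if the conversion did not succeed.
conv_res = doc_converter.convert_single("my_doc.pdf")
print(conv_res.render_as_text())
```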
@@ -177,7 +186,7 @@ class DocumentConverter:
f"Unexpected file path type encountered: {type(source)}"
)
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
conv_res_iter = self.convert(conv_inp)
conv_res_iter = self.convert_batch(conv_inp)
conv_res: ConversionResult = next(conv_res_iter)
if conv_res.status not in {
ConversionStatus.SUCCESS,
@@ -186,7 +195,7 @@ class DocumentConverter:
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
return conv_res

def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
fopt = self.format_to_options.get(doc.format)

if fopt is None:
@@ -1,5 +1,7 @@
from abc import ABC, abstractmethod
from typing import Iterable
from typing import Any, Iterable

from docling_core.types.experimental import DoclingDocument, NodeItem

from docling.datamodel.base_models import Page

@@ -8,3 +10,11 @@ class AbstractPageModel(ABC):
@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
pass


class AbstractEnrichmentModel(ABC):
@abstractmethod
def __call__(
self, doc: DoclingDocument, elements: Iterable[NodeItem]
) -> Iterable[Any]:
pass
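The new `AbstractEnrichmentModel` is the enrichment interface referenced in the commit title: it receives the assembled `DoclingDocument` plus a batch of `NodeItem` elements and yields per-element results. A minimal conforming implementation might look like the sketch below; the interface definition is copied from the hunk above, while `DummyEnrichmentModel` and its logic are purely illustrative:

```python
from abc import ABC, abstractmethod
from typing import Any, Iterable

from docling_core.types.experimental import DoclingDocument, NodeItem


class AbstractEnrichmentModel(ABC):
    # Interface as introduced in this commit.
    @abstractmethod
    def __call__(
        self, doc: DoclingDocument, elements: Iterable[NodeItem]
    ) -> Iterable[Any]:
        pass


class DummyEnrichmentModel(AbstractEnrichmentModel):
    """Toy enrichment model: yields the type name of every element it sees."""

    def __call__(
        self, doc: DoclingDocument, elements: Iterable[NodeItem]
    ) -> Iterable[Any]:
        for element in elements:
            # A real model would derive new information for `element`
            # (e.g. classify a picture) and attach it to `doc`; this stub
            # only yields a placeholder result per element.
            yield type(element).__name__
```

Such element-level models are meant to be registered in the `enrichment_pipe` list that this commit adds to `StandardPdfModelPipeline` further below.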
@@ -21,7 +21,7 @@ from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)


class BaseModelPipeline(ABC):
class AbstractModelPipeline(ABC):
def __init__(self, pipeline_options: PipelineOptions):
self.pipeline_options = pipeline_options
self.model_pipe: List[Callable] = []
@@ -31,7 +31,7 @@ class BaseModelPipeline(ABC):
pass

@abstractmethod
def assemble_document(
def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
pass
@@ -47,9 +47,9 @@ class BaseModelPipeline(ABC):
pass


class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
class PaginatedModelPipeline(AbstractModelPipeline): # TODO this is a bad name.

def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe:
page_batch = model(page_batch)

@@ -83,7 +83,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
)

# 2. Run pipeline stages
pipeline_pages = self.apply_on_pages(init_pages)
pipeline_pages = self._apply_on_pages(init_pages)

for p in pipeline_pages: # Must exhaust!
pass
@@ -91,7 +91,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")

conv_res = self.assemble_document(in_doc, conv_res)
conv_res = self._assemble_document(in_doc, conv_res)

status = ConversionStatus.SUCCESS
for page in conv_res.pages:
@@ -1,4 +1,7 @@
import logging
from typing import Iterable

from docling_core.types.experimental import NodeItem

from docling.backend.abstract_backend import (
AbstractDocumentBackend,
@@ -7,19 +10,19 @@ from docling.backend.abstract_backend import (
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.base_model_pipeline import AbstractModelPipeline

_log = logging.getLogger(__name__)


class SimpleModelPipeline(BaseModelPipeline):
class SimpleModelPipeline(AbstractModelPipeline):
"""SimpleModelPipeline.

This class is used at the moment for formats / backends
which produce straight DoclingDocument output.
"""

def __init__(self, pipeline_options: PdfPipelineOptions):
def __init__(self, pipeline_options: PipelineOptions):
super().__init__(pipeline_options)

def execute(self, in_doc: InputDocument) -> ConversionResult:
@@ -45,16 +48,21 @@ class SimpleModelPipeline(BaseModelPipeline):
# a DoclingDocument straight.

conv_res.output = in_doc._backend.convert()

# Do other stuff with conv_res.experimental

conv_res = self.assemble_document(in_doc, conv_res)
conv_res = self._assemble_document(in_doc, conv_res)

conv_res.status = ConversionStatus.SUCCESS

return conv_res

def assemble_document(
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
# for model in self.model_pipe:
# element_batch = model(element_batch)
#
# yield from element_batch

def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
return conv_res
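The commented-out `_apply_on_elements` above hints at how element batches could flow through enrichment models once wired up. A standalone sketch of that driving loop, assuming the two-argument `AbstractEnrichmentModel` call signature added earlier; the helper name and behaviour are illustrative, not part of this commit:

```python
from typing import Any, Iterable, List

from docling_core.types.experimental import DoclingDocument, NodeItem


def apply_enrichment_pipe(
    doc: DoclingDocument,
    elements: List[NodeItem],
    enrichment_pipe: List,  # e.g. instances implementing AbstractEnrichmentModel
) -> Iterable[Any]:
    # Feed the document and the element batch to each enrichment model and
    # collect whatever results the models yield.
    for model in enrichment_pipe:
        yield from model(doc, elements)
```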
@@ -83,6 +83,10 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
]

self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument
]

@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
@@ -104,7 +108,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):

return page

def assemble_document(
def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
all_elements = []
@@ -36,25 +36,25 @@ def export_documents(
with (output_dir / f"{doc_filename}.legacy.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict_v1()))
fp.write(json.dumps(conv_res.render_as_dict()))

# Export Text format:
with (output_dir / f"{doc_filename}.legacy.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_text_v1())
fp.write(conv_res.render_as_text())

# Export Markdown format:
with (output_dir / f"{doc_filename}.legacy.md").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_markdown_v1())
fp.write(conv_res.render_as_markdown())

# Export Document Tags format:
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags_v1())
fp.write(conv_res.render_as_doctags())

if USE_V2:
# Export Docling document format to JSON (experimental):
@@ -129,7 +129,7 @@ def main():

start_time = time.time()

conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_batch(input)
success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
@@ -39,17 +39,17 @@ def export_documents(

# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text_v1())
fp.write(conv_res.render_as_text())

# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown_v1())
fp.write(conv_res.render_as_markdown())

# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags_v1())
fp.write(conv_res.render_as_doctags())

else:
_log.info(f"Document {conv_res.input.file} failed to convert.")
@@ -157,7 +157,7 @@ def main():

start_time = time.time()

conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_batch(input)
success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
@@ -42,7 +42,7 @@ def main():

start_time = time.time()

conv_results = doc_converter.convert(input_files)
conv_results = doc_converter.convert_batch(input_files)

success_count = 0
failure_count = 0
@@ -41,7 +41,7 @@ def main():

start_time = time.time()

converted_docs = doc_converter.convert(input_files)
converted_docs = doc_converter.convert_batch(input_files)

success_count = 0
failure_count = 0
@@ -25,7 +25,7 @@ def main():

start_time = time.time()

conv_results = doc_converter.convert(input_files)
conv_results = doc_converter.convert_batch(input_files)

success_count = 0
failure_count = 0
@@ -50,7 +50,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
},
)

conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_batch(input)

for res in conv_results:
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
@@ -39,11 +39,11 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):

doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
with open(doctags_fn, "w") as fd:
fd.write(doc_result.render_as_doctags_v1())
fd.write(doc_result.render_as_doctags())

md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
with open(md_fn, "w") as fd:
fd.write(doc_result.render_as_markdown_v1())
fd.write(doc_result.render_as_markdown())


def get_pdf_paths():
@@ -54,7 +54,7 @@ def test_batch_path(converter: DocumentConverter):

conv_input = DocumentConversionInput.from_paths([pdf_path])

results = converter.convert(conv_input)
results = converter.convert_batch(conv_input)
for doc_result in results:
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
@@ -69,7 +69,7 @@ def test_batch_bytes(converter: DocumentConverter):
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
conv_input = DocumentConversionInput.from_streams(docs)

results = converter.convert(conv_input)
results = converter.convert_batch(conv_input)
for doc_result in results:
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
@@ -198,8 +198,8 @@ def verify_conversion_result_v1(

doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DsDocument = doc_result.legacy_output
doc_pred_md = doc_result.render_as_markdown_v1()
doc_pred_dt = doc_result.render_as_doctags_v1()
doc_pred_md = doc_result.render_as_markdown()
doc_pred_dt = doc_result.render_as_doctags()

engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name