fix: refine conversion result (#52)

- fields `output` & `assembled` need not be optional
- introduced "synonym" `ConversionResult` for `ConvertedDocument` & deprecated the latter

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas
2024-08-27 11:50:43 +02:00
committed by GitHub
parent fe817b11d7
commit e46a66a176
8 changed files with 96 additions and 90 deletions

View File

@@ -247,9 +247,9 @@ PageElement = Union[TextElement, TableElement, FigureElement]
class AssembledUnit(BaseModel):
elements: List[PageElement]
body: List[PageElement]
headers: List[PageElement]
elements: List[PageElement] = []
body: List[PageElement] = []
headers: List[PageElement] = []
class Page(BaseModel):

View File

@@ -12,6 +12,7 @@ from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from pydantic import BaseModel
from typing_extensions import deprecated
from docling.backend.abstract_backend import PdfDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@@ -49,6 +50,15 @@ layout_label_to_ds_type = {
"Text": "paragraph",
}
# Placeholder document: empty name, a description with no log entries, and
# file info with blank filename and document hash. It is used as the default
# value of `ConvertedDocument.output`, so that field is never None.
_EMPTY_DOC = DsDocument(
_name="",
description=DsDocumentDescription(logs=[]),
file_info=DsFileInfoObject(
filename="",
document_hash="",
),
)
class InputDocument(BaseModel):
file: PurePath = None
@@ -115,6 +125,7 @@ class InputDocument(BaseModel):
# raise
@deprecated("Use `ConversionResult` instead.")
class ConvertedDocument(BaseModel):
input: InputDocument
@@ -122,11 +133,11 @@ class ConvertedDocument(BaseModel):
errors: List[ErrorItem] = [] # structure to keep errors
pages: List[Page] = []
assembled: Optional[AssembledUnit] = None
assembled: AssembledUnit = AssembledUnit()
output: Optional[DsDocument] = None
output: DsDocument = _EMPTY_DOC
def to_ds_document(self) -> DsDocument:
def _to_ds_document(self) -> DsDocument:
title = ""
desc = DsDocumentDescription(logs=[])
@@ -297,16 +308,10 @@ class ConvertedDocument(BaseModel):
return ds_doc
def render_as_dict(self):
if self.output:
return self.output.model_dump(by_alias=True, exclude_none=True)
else:
return {}
return self.output.model_dump(by_alias=True, exclude_none=True)
def render_as_markdown(self):
if self.output:
return self.output.export_to_markdown()
else:
return ""
return self.output.export_to_markdown()
def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,)
@@ -323,6 +328,10 @@ class ConvertedDocument(BaseModel):
yield element, cropped_im
# Synonym for `ConvertedDocument`: adds no fields or methods of its own and
# inherits everything from the parent, which is marked
# `@deprecated("Use `ConversionResult` instead.")`.
# NOTE(review): `typing_extensions.deprecated` may also warn when a deprecated
# class is subclassed -- confirm that defining this subclass does not itself
# emit the deprecation warning at import time.
class ConversionResult(ConvertedDocument):
pass
class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None