mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
fix: Call into docling-core for legacy document transform (#551)
Call into docling-core for legacy document transform Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
|
||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||
from docling_core.utils.file import resolve_source_to_stream
|
||||
from docling_core.utils.legacy import docling_document_to_legacy
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
@@ -189,259 +190,7 @@ class ConversionResult(BaseModel):
|
||||
@property
@deprecated("Use document instead.")
def legacy_document(self):
    """Return ``self.document`` converted to the legacy document format.

    Deprecated: callers should use ``document`` instead.

    The conversion is performed by ``docling_document_to_legacy`` from
    ``docling_core.utils.legacy``; no transform logic is kept in this class,
    so there is a single implementation of the legacy mapping to maintain.

    Returns:
        The value produced by ``docling_document_to_legacy(self.document)``.
        NOTE(review): presumably the legacy ``ExportedCCSDocument`` model --
        confirm against docling-core.
    """
    # Delegate instead of rebuilding main_text/tables/figures inline: an
    # inline copy that returns first would leave this call unreachable.
    return docling_document_to_legacy(self.document)
|
||||
|
||||
|
||||
class _DummyBackend(AbstractDocumentBackend):
|
||||
|
||||
Reference in New Issue
Block a user