mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix: Use proper page concatenation in VLM pipeline MD/HTML conversion (#2458)
* Use proper page concatenation in VLM pipeline MD/HTML conversion Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -6,6 +6,7 @@ from typing import List, Optional, Union, cast
|
|||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
BoundingBox,
|
BoundingBox,
|
||||||
|
ContentLayer,
|
||||||
DocItem,
|
DocItem,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
ImageRef,
|
ImageRef,
|
||||||
@@ -251,9 +252,9 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
# No code blocks found, return original text
|
# No code blocks found, return original text
|
||||||
return text
|
return text
|
||||||
|
|
||||||
for pg_idx, page in enumerate(conv_res.pages):
|
page_docs = []
|
||||||
page_no = pg_idx + 1 # FIXME: might be incorrect
|
|
||||||
|
|
||||||
|
for pg_idx, page in enumerate(conv_res.pages):
|
||||||
predicted_text = ""
|
predicted_text = ""
|
||||||
if page.predictions.vlm_response:
|
if page.predictions.vlm_response:
|
||||||
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
||||||
@@ -273,22 +274,13 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
)
|
)
|
||||||
page_doc = backend.convert()
|
page_doc = backend.convert()
|
||||||
|
|
||||||
if page.image is not None:
|
# Modify provenance in place for all items in the page document
|
||||||
pg_width = page.image.width
|
for item, level in page_doc.iterate_items(
|
||||||
pg_height = page.image.height
|
with_groups=True,
|
||||||
else:
|
traverse_pictures=True,
|
||||||
pg_width = 1
|
included_content_layers=set(ContentLayer),
|
||||||
pg_height = 1
|
):
|
||||||
|
if isinstance(item, DocItem):
|
||||||
conv_res.document.add_page(
|
|
||||||
page_no=page_no,
|
|
||||||
size=Size(width=pg_width, height=pg_height),
|
|
||||||
image=ImageRef.from_pil(image=page.image, dpi=72)
|
|
||||||
if page.image
|
|
||||||
else None,
|
|
||||||
)
|
|
||||||
|
|
||||||
for item, level in page_doc.iterate_items():
|
|
||||||
item.prov = [
|
item.prov = [
|
||||||
ProvenanceItem(
|
ProvenanceItem(
|
||||||
page_no=pg_idx + 1,
|
page_no=pg_idx + 1,
|
||||||
@@ -298,9 +290,27 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
charspan=[0, 0],
|
charspan=[0, 0],
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
conv_res.document.append_child_item(child=item)
|
|
||||||
|
|
||||||
return conv_res.document
|
# Add page metadata to the page document before concatenation
|
||||||
|
if page.image is not None:
|
||||||
|
pg_width = page.image.width
|
||||||
|
pg_height = page.image.height
|
||||||
|
else:
|
||||||
|
pg_width = 1
|
||||||
|
pg_height = 1
|
||||||
|
|
||||||
|
page_doc.add_page(
|
||||||
|
page_no=pg_idx + 1,
|
||||||
|
size=Size(width=pg_width, height=pg_height),
|
||||||
|
image=ImageRef.from_pil(image=page.image, dpi=72)
|
||||||
|
if page.image
|
||||||
|
else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
page_docs.append(page_doc)
|
||||||
|
|
||||||
|
final_doc = DoclingDocument.concatenate(docs=page_docs)
|
||||||
|
return final_doc
|
||||||
|
|
||||||
def _turn_html_into_doc(self, conv_res):
|
def _turn_html_into_doc(self, conv_res):
|
||||||
def _extract_html_code(text):
|
def _extract_html_code(text):
|
||||||
@@ -328,9 +338,9 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
# No code blocks found, return original text
|
# No code blocks found, return original text
|
||||||
return text
|
return text
|
||||||
|
|
||||||
for pg_idx, page in enumerate(conv_res.pages):
|
page_docs = []
|
||||||
page_no = pg_idx + 1 # FIXME: might be incorrect
|
|
||||||
|
|
||||||
|
for pg_idx, page in enumerate(conv_res.pages):
|
||||||
predicted_text = ""
|
predicted_text = ""
|
||||||
if page.predictions.vlm_response:
|
if page.predictions.vlm_response:
|
||||||
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
||||||
@@ -341,7 +351,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
out_doc = InputDocument(
|
out_doc = InputDocument(
|
||||||
path_or_stream=response_bytes,
|
path_or_stream=response_bytes,
|
||||||
filename=conv_res.input.file.name,
|
filename=conv_res.input.file.name,
|
||||||
format=InputFormat.MD,
|
format=InputFormat.HTML,
|
||||||
backend=HTMLDocumentBackend,
|
backend=HTMLDocumentBackend,
|
||||||
)
|
)
|
||||||
backend = HTMLDocumentBackend(
|
backend = HTMLDocumentBackend(
|
||||||
@@ -350,22 +360,13 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
)
|
)
|
||||||
page_doc = backend.convert()
|
page_doc = backend.convert()
|
||||||
|
|
||||||
if page.image is not None:
|
# Modify provenance in place for all items in the page document
|
||||||
pg_width = page.image.width
|
for item, level in page_doc.iterate_items(
|
||||||
pg_height = page.image.height
|
with_groups=True,
|
||||||
else:
|
traverse_pictures=True,
|
||||||
pg_width = 1
|
included_content_layers=set(ContentLayer),
|
||||||
pg_height = 1
|
):
|
||||||
|
if isinstance(item, DocItem):
|
||||||
conv_res.document.add_page(
|
|
||||||
page_no=page_no,
|
|
||||||
size=Size(width=pg_width, height=pg_height),
|
|
||||||
image=ImageRef.from_pil(image=page.image, dpi=72)
|
|
||||||
if page.image
|
|
||||||
else None,
|
|
||||||
)
|
|
||||||
|
|
||||||
for item, level in page_doc.iterate_items():
|
|
||||||
item.prov = [
|
item.prov = [
|
||||||
ProvenanceItem(
|
ProvenanceItem(
|
||||||
page_no=pg_idx + 1,
|
page_no=pg_idx + 1,
|
||||||
@@ -375,9 +376,28 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
charspan=[0, 0],
|
charspan=[0, 0],
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
conv_res.document.append_child_item(child=item)
|
|
||||||
|
|
||||||
return conv_res.document
|
# Add page metadata to the page document before concatenation
|
||||||
|
if page.image is not None:
|
||||||
|
pg_width = page.image.width
|
||||||
|
pg_height = page.image.height
|
||||||
|
else:
|
||||||
|
pg_width = 1
|
||||||
|
pg_height = 1
|
||||||
|
|
||||||
|
page_doc.add_page(
|
||||||
|
page_no=pg_idx + 1,
|
||||||
|
size=Size(width=pg_width, height=pg_height),
|
||||||
|
image=ImageRef.from_pil(image=page.image, dpi=72)
|
||||||
|
if page.image
|
||||||
|
else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
page_docs.append(page_doc)
|
||||||
|
|
||||||
|
# Concatenate all page documents to preserve hierarchy
|
||||||
|
final_doc = DoclingDocument.concatenate(docs=page_docs)
|
||||||
|
return final_doc
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_default_options(cls) -> VlmPipelineOptions:
|
def get_default_options(cls) -> VlmPipelineOptions:
|
||||||
|
|||||||
Reference in New Issue
Block a user