From 8e54299eac0987f5f41efeba253f95511e7219da Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 17 Mar 2025 13:29:07 +0100 Subject: [PATCH] preparing to migrate to new doctags deserializer Signed-off-by: Maksym Lysak --- docling/pipeline/vlm_pipeline.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 11273564..0b7a038b 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -9,6 +9,7 @@ from pathlib import Path from typing import Optional from docling_core.types import DoclingDocument +from docling_core.types.doc.document import DocTagsDocument from docling_core.types.doc import ( BoundingBox, DocItem, @@ -108,6 +109,11 @@ class VlmPipeline(PaginatedPipeline): == ResponseFormat.DOCTAGS ): conv_res.document = self._turn_tags_into_doc(conv_res.pages) + # doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image]) + # conv_res.document.load_from_doctags(doctags_doc) + # USE THIS TO FORCE BACKEND TEXT + # if self.force_backend_text: + # text_content = extract_text_from_backend(page, bbox) elif ( self.pipeline_options.vlm_options.response_format == ResponseFormat.MARKDOWN