mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
propagated changes for new CodeItem class
Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>
This commit is contained in:
parent
57fc28d3d8
commit
6048f8ac14
@ -40,7 +40,7 @@ class LayoutModel(BasePageModel):
|
|||||||
DocItemLabel.PAGE_FOOTER,
|
DocItemLabel.PAGE_FOOTER,
|
||||||
DocItemLabel.CODE,
|
DocItemLabel.CODE,
|
||||||
DocItemLabel.LIST_ITEM,
|
DocItemLabel.LIST_ITEM,
|
||||||
# "Formula",
|
DocItemLabel.FORMULA,
|
||||||
]
|
]
|
||||||
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
|
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
|
||||||
|
|
||||||
|
@ -135,31 +135,6 @@ class PageAssembleModel(BasePageModel):
|
|||||||
)
|
)
|
||||||
elements.append(fig)
|
elements.append(fig)
|
||||||
body.append(fig)
|
body.append(fig)
|
||||||
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
|
||||||
equation = None
|
|
||||||
if page.predictions.equations_prediction:
|
|
||||||
equation = page.predictions.equations_prediction.equation_map.get(
|
|
||||||
cluster.id, None
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
not equation
|
|
||||||
): # fallback: add empty formula, if it isn't present
|
|
||||||
text = self.sanitize_text(
|
|
||||||
[
|
|
||||||
cell.text.replace("\x02", "-").strip()
|
|
||||||
for cell in cluster.cells
|
|
||||||
if len(cell.text.strip()) > 0
|
|
||||||
]
|
|
||||||
)
|
|
||||||
equation = TextElement(
|
|
||||||
label=cluster.label,
|
|
||||||
id=cluster.id,
|
|
||||||
cluster=cluster,
|
|
||||||
page_no=page.page_no,
|
|
||||||
text=text,
|
|
||||||
)
|
|
||||||
elements.append(equation)
|
|
||||||
body.append(equation)
|
|
||||||
elif cluster.label in LayoutModel.CONTAINER_LABELS:
|
elif cluster.label in LayoutModel.CONTAINER_LABELS:
|
||||||
container_el = ContainerElement(
|
container_el = ContainerElement(
|
||||||
label=cluster.label,
|
label=cluster.label,
|
||||||
|
@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|||||||
container_el = doc.add_group(label=group_label)
|
container_el = doc.add_group(label=group_label)
|
||||||
|
|
||||||
_add_child_elements(container_el, doc, obj, pelem)
|
_add_child_elements(container_el, doc, obj, pelem)
|
||||||
|
|
||||||
elif "text" in obj:
|
elif "text" in obj:
|
||||||
text = obj["text"][span_i:span_j]
|
text = obj["text"][span_i:span_j]
|
||||||
|
|
||||||
@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|||||||
current_list = None
|
current_list = None
|
||||||
|
|
||||||
doc.add_heading(text=text, prov=prov)
|
doc.add_heading(text=text, prov=prov)
|
||||||
|
elif label == DocItemLabel.CODE:
|
||||||
|
current_list = None
|
||||||
|
|
||||||
|
doc.add_code(text=text, prov=prov)
|
||||||
else:
|
else:
|
||||||
current_list = None
|
current_list = None
|
||||||
|
|
||||||
|
173
docs/examples/develop_code_equation_enrichment.py
Normal file
173
docs/examples/develop_code_equation_enrichment.py
Normal file
@ -0,0 +1,173 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Iterable, Literal
|
||||||
|
|
||||||
|
from docling_core.types.doc import (
|
||||||
|
DoclingDocument,
|
||||||
|
NodeItem,
|
||||||
|
TextItem,
|
||||||
|
)
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.pipeline_options import AcceleratorOptions, PdfPipelineOptions
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
from docling.models.base_model import BaseEnrichmentModel
|
||||||
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
|
|
||||||
|
from docling_ibm_models.code_formula_model.code_formula_predictor import (
|
||||||
|
CodeFormulaPredictor,
|
||||||
|
)
|
||||||
|
|
||||||
|
from docling.datamodel.settings import settings
|
||||||
|
|
||||||
|
# TODO: remove this. Imported so that the models are registered
|
||||||
|
from docling_ibm_models.code_formula_model.models.vary_opt import *
|
||||||
|
from docling_ibm_models.code_formula_model.models.vary_opt_image_processor import *
|
||||||
|
|
||||||
|
|
||||||
|
class CodeFormulaMode(str, Enum):
    """Modes for the CodeFormula model.

    Each member is also a plain ``str`` (the enum mixes in ``str``), so it
    compares equal to its literal value, e.g. ``CodeFormulaMode.CODE == "code"``.
    """

    CODE = "code"
    FORMULA = "formula"
    CODE_FORMULA = "code_formula"
|
||||||
|
|
||||||
|
|
||||||
|
class CodeFormulaModelOptions(BaseModel):
    """Options for the code/formula enrichment model."""

    # Fixed tag for this options type; the only accepted value is "code_formula".
    kind: Literal["code_formula"] = "code_formula"

    # Selected CodeFormulaMode; defaults to the combined code + formula mode.
    mode: CodeFormulaMode = CodeFormulaMode.CODE_FORMULA
|
||||||
|
|
||||||
|
|
||||||
|
class CodeFormulaModel(BaseEnrichmentModel):
    """Enrichment model that replaces the text of code and formula items.

    Items are selected by ``is_processable`` according to the configured
    ``CodeFormulaMode``; ``__call__`` then overwrites ``element.text`` with the
    output of the underlying ``CodeFormulaPredictor``.
    """

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Path,
        accelerator_options: AcceleratorOptions,
        code_formula_options: CodeFormulaModelOptions,
    ):
        """Init the CodeFormulaModel.

        Args:
            enabled (bool): True if the model is enabled, False otherwise.
            artifacts_path (Path): Location of the model artifacts, forwarded
                to the predictor.
            accelerator_options (AcceleratorOptions): Supplies the ``device``
                and ``num_threads`` forwarded to the predictor.
            code_formula_options (CodeFormulaModelOptions): Supplies the
                processing ``mode``.
        """
        self.enabled = enabled
        self.mode = code_formula_options.mode

        self.code_formula_model = CodeFormulaPredictor(
            artifacts_path=artifacts_path,
            device=accelerator_options.device,
            num_threads=accelerator_options.num_threads,
        )

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Return True if ``element`` should be enriched by this model.

        An element qualifies when the model is enabled, the element is a
        ``TextItem``, and its label ("code" or "formula") is covered by the
        configured mode.
        """
        if not self.enabled or not isinstance(element, TextItem):
            return False

        if element.label == "code":
            # The mode must be compared explicitly: a bare enum member is
            # always truthy and would accept code items in FORMULA-only mode.
            return self.mode in (CodeFormulaMode.CODE, CodeFormulaMode.CODE_FORMULA)

        if element.label == "formula":
            return self.mode in (
                CodeFormulaMode.FORMULA,
                CodeFormulaMode.CODE_FORMULA,
            )

        return False

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        """Run the predictor on a batch and write results into each element.

        Args:
            doc: Document the elements belong to; used to obtain each
                element's image.
            element_batch: Elements to enrich.

        Yields:
            The processed batch, once, after every element's ``text`` has been
            replaced by the corresponding predictor output. Nothing is yielded
            when the model is disabled.
        """
        if not self.enabled:
            return

        # The batch is traversed several times below, so a one-shot iterable
        # must be materialized first. A list is passed through unchanged.
        batch = (
            element_batch
            if isinstance(element_batch, list)
            else list(element_batch)
        )

        # ! TODO: batch size missing
        if batch:  # skip the predictor entirely for an empty batch
            images = [el.get_image(doc) for el in batch]
            labels = [el.label for el in batch]

            outputs = self.code_formula_model.predict(images, labels)

            for element, output in zip(batch, outputs):
                element.text = output

        yield batch
|
||||||
|
|
||||||
|
|
||||||
|
class CodeFormulaPipelineOptions(PdfPipelineOptions):
    """PDF pipeline options extended with a code/formula enrichment switch."""

    # Whether code/formula enrichment is enabled; on by default.
    do_code_formula_enrichment: bool = True
|
||||||
|
|
||||||
|
class CodeFormulaPipeline(StandardPdfPipeline):
    """Standard PDF pipeline whose enrichment stage is a CodeFormulaModel."""

    def __init__(self, pipeline_options: CodeFormulaPipelineOptions):
        """Build the standard pipeline, then install the enrichment model.

        Args:
            pipeline_options: Pipeline configuration; its
                ``do_code_formula_enrichment`` flag enables or disables the
                enrichment model.
        """
        super().__init__(pipeline_options)
        self.pipeline_options: CodeFormulaPipelineOptions

        self.enrichment_pipe = [
            CodeFormulaModel(
                enabled=pipeline_options.do_code_formula_enrichment,
                # NOTE(review): hard-coded, site-specific checkpoint location;
                # this needs to become configurable before the example can
                # run elsewhere.
                artifacts_path="/dccstor/doc_fig_class/DocFM-Vision-Pretrainer/Vary-master/checkpoints_code_equation_model/best_run",
                accelerator_options=AcceleratorOptions(device="cpu"),
                code_formula_options=CodeFormulaModelOptions(),
            )
        ]

    @classmethod
    def get_default_options(cls) -> CodeFormulaPipelineOptions:
        """Return a CodeFormulaPipelineOptions instance with default values."""
        return CodeFormulaPipelineOptions()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Convert the sample PDF with CodeFormulaPipeline and print the results.

    Enables the layout/OCR/table debug visualizations, runs the conversion,
    and prints the text of every code and formula item in the resulting
    document.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path(
        "/dccstor/doc_fig_class/docling-ibm/test/data/pdf/code_and_formulas.pdf"
    )
    if not input_doc_path.exists():
        # Fall back to the copy of the sample document kept in the repository
        # so the example also runs where the cluster path is not available.
        input_doc_path = Path("./tests/data/code_and_formulas.pdf")

    settings.debug.visualize_raw_layout = True
    settings.debug.visualize_layout = True
    settings.debug.visualize_ocr = True
    settings.debug.visualize_tables = True

    pipeline_options = CodeFormulaPipelineOptions()
    pipeline_options.images_scale = 2.0

    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=CodeFormulaPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = doc_converter.convert(input_doc_path)

    for element, _level in result.document.iterate_items():
        if isinstance(element, TextItem) and element.label in ("code", "formula"):
            print(
                f"The model populated the `text` portion of the TextElement {element.self_ref}:\n{element.text}\n\n\n\n\n"
            )


if __name__ == "__main__":
    main()
|
BIN
tests/data/code_and_formulas.pdf
Normal file
BIN
tests/data/code_and_formulas.pdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user