feat: new torch-based docling models (#120)

---------

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2024-10-03 18:42:33 +02:00
committed by GitHub
parent 9ebbbc1245
commit 2422f706a1
30 changed files with 1159 additions and 1185 deletions

View File

@@ -67,7 +67,10 @@ class DocumentConverter:
from huggingface_hub import snapshot_download
download_path = snapshot_download(
-repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
+repo_id="ds4sd/docling-models",
+force_download=force,
+local_dir=local_dir,
+revision="v2.0.0",
)
return Path(download_path)

View File

@@ -33,6 +33,7 @@ class LayoutModel:
"Page-footer",
"Code",
"List-item",
# "Title"
# "Formula",
]
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
@@ -69,9 +70,7 @@ class LayoutModel:
"Key-Value Region": 0.45,
}
-CLASS_REMAPPINGS = {
-"Document Index": "Table",
-}
+CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
_log.debug("================= Start postprocess function ====================")
start_time = time.time()
@@ -277,6 +276,7 @@ class LayoutModel:
bbox=BoundingBox.model_validate(pred_item),
cells=[],
)
clusters.append(cluster)
# Map cells to clusters

View File

@@ -8,7 +8,7 @@ from docling.pipeline.base_model_pipeline import BaseModelPipeline
class StandardModelPipeline(BaseModelPipeline):
-_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
+_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):