Mirror of https://github.com/DS4SD/docling.git
(synced 2025-12-09 13:18:24 +00:00)
feat: new torch-based docling models (#120)
---------
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
@@ -67,7 +67,10 @@ class DocumentConverter:
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
|
||||
repo_id="ds4sd/docling-models",
|
||||
force_download=force,
|
||||
local_dir=local_dir,
|
||||
revision="v2.0.0",
|
||||
)
|
||||
|
||||
return Path(download_path)
|
||||
|
||||
@@ -33,6 +33,7 @@ class LayoutModel:
|
||||
"Page-footer",
|
||||
"Code",
|
||||
"List-item",
|
||||
# "Title"
|
||||
# "Formula",
|
||||
]
|
||||
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
|
||||
@@ -69,9 +70,7 @@ class LayoutModel:
|
||||
"Key-Value Region": 0.45,
|
||||
}
|
||||
|
||||
CLASS_REMAPPINGS = {
|
||||
"Document Index": "Table",
|
||||
}
|
||||
CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
|
||||
|
||||
_log.debug("================= Start postprocess function ====================")
|
||||
start_time = time.time()
|
||||
@@ -277,6 +276,7 @@ class LayoutModel:
|
||||
bbox=BoundingBox.model_validate(pred_item),
|
||||
cells=[],
|
||||
)
|
||||
|
||||
clusters.append(cluster)
|
||||
|
||||
# Map cells to clusters
|
||||
|
||||
@@ -8,7 +8,7 @@ from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
|
||||
|
||||
class StandardModelPipeline(BaseModelPipeline):
|
||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
|
||||
_table_model_path = "model_artifacts/tableformer"
|
||||
|
||||
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
||||
|
||||
Reference in New Issue
Block a user