mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: [Beta] Extraction with schema (#2138)
* Add DocumentConverter.extract and full extraction pipeline
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add DocumentConverter.extract template arg
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add NuExtract model
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add Extraction pipeline
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add proper test, support pydantic class types
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add qr bill example
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add base_extraction_pipeline
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add types
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update typing of ExtractionResult and inner fields
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Factor out extract to DocumentExtractor
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Address mypy issues
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add DocumentExtractor
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Resolve circular import issue
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Clean up imports, remove Optional for template arg
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Move new type definitions into datamodel
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update comments
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Respect page-range, disable test_extraction for CI
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -71,6 +71,7 @@ dependencies = [
|
||||
# 'scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"',
|
||||
# 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
|
||||
"accelerate>=1.0.0,<2",
|
||||
"polyfactory>=2.22.2",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -94,6 +95,7 @@ vlm = [
|
||||
'accelerate (>=1.2.1,<2.0.0)',
|
||||
'mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
|
||||
'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64"',
|
||||
"qwen-vl-utils>=0.0.11",
|
||||
]
|
||||
rapidocr = [
|
||||
'rapidocr (>=3.3,<4.0.0) ; python_version < "3.14"',
|
||||
@@ -255,6 +257,7 @@ module = [
|
||||
"transformers.*",
|
||||
"pylatexenc.*",
|
||||
"vllm.*",
|
||||
"qwen_vl_utils.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
||||
Reference in New Issue
Block a user