feat: support xlsm files (#1520)

* code for xlsm support

* updated support for xlsm

* updated code for xlsm support

* Update docling_parse_v4_backend.py

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>

* Update docling_parse_v4_backend.py

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>

* Update test_backend_msexcel_xlsm.py

 Updated tests/test_backend_msexcel_xlsm.py:

 - have a function whose name starts with "test"
 - removed all print statements
 - to add an explicit assert {test}=={pred}

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>

* Update base_models.py

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>

* Update test_backend_msexcel.py

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>

* Update test_backend_msexcel_xlsm.py

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>

* Update document_converter.py

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>

* Delete tests/test_backend_msexcel_xlsm.py

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>

* xlsm file

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>

* run tests

* ran tests

* Fix tests, upgrade XLSM example to a valid file

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Ayraf
2025-06-10 20:25:59 +05:30
committed by GitHub
parent 6613b9e98b
commit df140227c3
19 changed files with 4834 additions and 632 deletions

View File

@@ -1,2 +1,2 @@
<doctag><text><loc_60><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
<doctag><text><loc_59><loc_46><loc_424><loc_90>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>

View File

@@ -42,10 +42,10 @@
{
"page_no": 1,
"bbox": {
"l": 238.19302423176944,
"l": 234.08627147881114,
"t": 2570.0959833241664,
"r": 1696.0985546594009,
"b": 2315.204273887442,
"r": 1696.0985042090742,
"b": 2319.1220927976665,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [

View File

@@ -40,14 +40,14 @@
"a": 255
},
"rect": {
"r_x0": 238.19302423176944,
"r_y0": 415.36904822716525,
"r_x1": 1696.0985546594009,
"r_y1": 415.36904822716525,
"r_x2": 1696.0985546594009,
"r_y2": 345.20535775097477,
"r_x3": 238.19302423176944,
"r_y3": 345.20535775097477,
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@@ -65,14 +65,14 @@
"a": 255
},
"rect": {
"r_x0": 245.43122061153045,
"r_y0": 513.795726112558,
"r_x1": 514.3223724413002,
"r_y1": 513.795726112558,
"r_x2": 514.3223724413002,
"r_y2": 436.0574704074058,
"r_x3": 245.43122061153045,
"r_y3": 436.0574704074058,
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",
@@ -90,13 +90,13 @@
"id": 0,
"label": "text",
"bbox": {
"l": 238.19302423176944,
"l": 234.08627147881114,
"t": 258.9040166758338,
"r": 1696.0985546594009,
"b": 513.795726112558,
"r": 1696.0985042090742,
"b": 509.8779072023336,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9721010327339172,
"confidence": 0.9721011519432068,
"cells": [
{
"index": 0,
@@ -132,14 +132,14 @@
"a": 255
},
"rect": {
"r_x0": 238.19302423176944,
"r_y0": 415.36904822716525,
"r_x1": 1696.0985546594009,
"r_y1": 415.36904822716525,
"r_x2": 1696.0985546594009,
"r_y2": 345.20535775097477,
"r_x3": 238.19302423176944,
"r_y3": 345.20535775097477,
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@@ -157,14 +157,14 @@
"a": 255
},
"rect": {
"r_x0": 245.43122061153045,
"r_y0": 513.795726112558,
"r_x1": 514.3223724413002,
"r_y1": 513.795726112558,
"r_x2": 514.3223724413002,
"r_y2": 436.0574704074058,
"r_x3": 245.43122061153045,
"r_y3": 436.0574704074058,
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",
@@ -195,13 +195,13 @@
"id": 0,
"label": "text",
"bbox": {
"l": 238.19302423176944,
"l": 234.08627147881114,
"t": 258.9040166758338,
"r": 1696.0985546594009,
"b": 513.795726112558,
"r": 1696.0985042090742,
"b": 509.8779072023336,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9721010327339172,
"confidence": 0.9721011519432068,
"cells": [
{
"index": 0,
@@ -237,14 +237,14 @@
"a": 255
},
"rect": {
"r_x0": 238.19302423176944,
"r_y0": 415.36904822716525,
"r_x1": 1696.0985546594009,
"r_y1": 415.36904822716525,
"r_x2": 1696.0985546594009,
"r_y2": 345.20535775097477,
"r_x3": 238.19302423176944,
"r_y3": 345.20535775097477,
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@@ -262,14 +262,14 @@
"a": 255
},
"rect": {
"r_x0": 245.43122061153045,
"r_y0": 513.795726112558,
"r_x1": 514.3223724413002,
"r_y1": 513.795726112558,
"r_x2": 514.3223724413002,
"r_y2": 436.0574704074058,
"r_x3": 245.43122061153045,
"r_y3": 436.0574704074058,
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",
@@ -293,13 +293,13 @@
"id": 0,
"label": "text",
"bbox": {
"l": 238.19302423176944,
"l": 234.08627147881114,
"t": 258.9040166758338,
"r": 1696.0985546594009,
"b": 513.795726112558,
"r": 1696.0985042090742,
"b": 509.8779072023336,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9721010327339172,
"confidence": 0.9721011519432068,
"cells": [
{
"index": 0,
@@ -335,14 +335,14 @@
"a": 255
},
"rect": {
"r_x0": 238.19302423176944,
"r_y0": 415.36904822716525,
"r_x1": 1696.0985546594009,
"r_y1": 415.36904822716525,
"r_x2": 1696.0985546594009,
"r_y2": 345.20535775097477,
"r_x3": 238.19302423176944,
"r_y3": 345.20535775097477,
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@@ -360,14 +360,14 @@
"a": 255
},
"rect": {
"r_x0": 245.43122061153045,
"r_y0": 513.795726112558,
"r_x1": 514.3223724413002,
"r_y1": 513.795726112558,
"r_x2": 514.3223724413002,
"r_y2": 436.0574704074058,
"r_x3": 245.43122061153045,
"r_y3": 436.0574704074058,
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",