feat: updated the backend for new docling-parse (#2187)

* updated the backend and pyproject.toml

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the version and test files

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the lock

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* forgot to add 1 updated test-file

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the lock

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-09-05 10:42:31 +02:00
committed by GitHub
parent 2c3f6faf3d
commit b3d7542061
7 changed files with 826 additions and 851 deletions

View File

@@ -47,8 +47,12 @@ class DoclingParseV4PageBackend(PdfPageBackend):
seg_page = self._dp_doc.get_page(
self._page_no + 1,
keep_chars=True,
keep_lines=True,
keep_bitmaps=True,
create_words=self._create_words,
create_textlines=self._create_textlines,
enforce_same_font=True,
)
# In Docling, all TextCell instances are expected with top-left origin.

View File

@@ -45,7 +45,7 @@ requires-python = '>=3.9,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
'docling-core[chunking] (>=2.42.0,<3.0.0)',
'docling-parse (>=4.2.2,<5.0.0)',
'docling-parse (>=4.4.0,<5.0.0)',
"docling-ibm-models>=3.9.1,<4",
'filetype (>=1.2.0,<2.0.0)',
'pypdfium2 (>=4.30.0,!=4.30.1,<5.0.0)',

View File

@@ -1145,7 +1145,7 @@
},
"charspan": [
0,
352
351
]
}
],

View File

@@ -829777,16 +829777,16 @@
"rect": {
"r_x0": 148.13,
"r_y0": 273.52,
"r_x1": 167.52,
"r_x1": 199.78,
"r_y1": 273.52,
"r_x2": 167.52,
"r_x2": 199.78,
"r_y2": 264.36,
"r_x3": 148.13,
"r_y3": 264.36,
"coord_origin": "TOPLEFT"
},
"text": "FOR",
"orig": "FOR",
"text": "FORROWS",
"orig": "FORROWS",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
@@ -829803,35 +829803,6 @@
"b": 0,
"a": 255
},
"rect": {
"r_x0": 169.98,
"r_y0": 273.52,
"r_x1": 199.78,
"r_y1": 273.52,
"r_x2": 199.78,
"r_y2": 264.36,
"r_x3": 169.98,
"r_y3": 264.36,
"coord_origin": "TOPLEFT"
},
"text": "ROWS",
"orig": "ROWS",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
"rendering_mode": -1,
"widget": false,
"font_key": "/F20",
"font_name": "/NKEFHC+Calibri,Bold"
},
{
"index": 141,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 321.56,
"r_y0": 253.86,
@@ -829854,7 +829825,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 142,
"index": 141,
"rgba": {
"r": 0,
"g": 0,
@@ -829883,7 +829854,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 143,
"index": 142,
"rgba": {
"r": 0,
"g": 0,
@@ -829912,7 +829883,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 144,
"index": 143,
"rgba": {
"r": 0,
"g": 0,
@@ -829941,7 +829912,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 145,
"index": 144,
"rgba": {
"r": 0,
"g": 0,
@@ -829970,7 +829941,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 146,
"index": 145,
"rgba": {
"r": 0,
"g": 0,
@@ -829999,7 +829970,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 147,
"index": 146,
"rgba": {
"r": 0,
"g": 0,
@@ -830028,7 +829999,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 148,
"index": 147,
"rgba": {
"r": 0,
"g": 0,
@@ -830057,7 +830028,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 149,
"index": 148,
"rgba": {
"r": 0,
"g": 0,
@@ -830086,7 +830057,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 150,
"index": 149,
"rgba": {
"r": 0,
"g": 0,
@@ -830115,7 +830086,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 151,
"index": 150,
"rgba": {
"r": 0,
"g": 0,
@@ -830144,7 +830115,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 152,
"index": 151,
"rgba": {
"r": 0,
"g": 0,
@@ -830173,7 +830144,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 153,
"index": 152,
"rgba": {
"r": 0,
"g": 0,
@@ -830202,7 +830173,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 154,
"index": 153,
"rgba": {
"r": 0,
"g": 0,
@@ -830231,7 +830202,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 155,
"index": 154,
"rgba": {
"r": 0,
"g": 0,
@@ -830260,7 +830231,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 156,
"index": 155,
"rgba": {
"r": 0,
"g": 0,
@@ -830289,7 +830260,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 157,
"index": 156,
"rgba": {
"r": 0,
"g": 0,
@@ -830318,7 +830289,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 158,
"index": 157,
"rgba": {
"r": 0,
"g": 0,
@@ -830347,7 +830318,7 @@
"font_name": "/NKEFHC+Calibri,Bold"
},
{
"index": 159,
"index": 158,
"rgba": {
"r": 0,
"g": 0,
@@ -830376,7 +830347,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 160,
"index": 159,
"rgba": {
"r": 0,
"g": 0,
@@ -830405,7 +830376,7 @@
"font_name": "/NKEFHD+Calibri,Italic"
},
{
"index": 161,
"index": 160,
"rgba": {
"r": 0,
"g": 0,
@@ -830434,7 +830405,7 @@
"font_name": "/NKEFHD+Calibri,Italic"
},
{
"index": 162,
"index": 161,
"rgba": {
"r": 0,
"g": 0,
@@ -830463,7 +830434,7 @@
"font_name": "/NKEFHD+Calibri,Italic"
},
{
"index": 163,
"index": 162,
"rgba": {
"r": 0,
"g": 0,
@@ -830492,7 +830463,7 @@
"font_name": "/NKEFHD+Calibri,Italic"
},
{
"index": 164,
"index": 163,
"rgba": {
"r": 0,
"g": 0,
@@ -830521,7 +830492,7 @@
"font_name": "/NKEFHD+Calibri,Italic"
},
{
"index": 165,
"index": 164,
"rgba": {
"r": 0,
"g": 0,
@@ -830550,7 +830521,7 @@
"font_name": "/NKEFHD+Calibri,Italic"
},
{
"index": 166,
"index": 165,
"rgba": {
"r": 0,
"g": 0,
@@ -830579,7 +830550,7 @@
"font_name": "/NKEFHD+Calibri,Italic"
},
{
"index": 167,
"index": 166,
"rgba": {
"r": 0,
"g": 0,
@@ -830608,7 +830579,7 @@
"font_name": "/NKEFHD+Calibri,Italic"
},
{
"index": 168,
"index": 167,
"rgba": {
"r": 0,
"g": 0,
@@ -830637,7 +830608,7 @@
"font_name": "/NKEFHD+Calibri,Italic"
},
{
"index": 169,
"index": 168,
"rgba": {
"r": 0,
"g": 0,
@@ -830666,7 +830637,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 170,
"index": 169,
"rgba": {
"r": 0,
"g": 0,
@@ -830695,7 +830666,7 @@
"font_name": "/NKEFHC+Calibri,Bold"
},
{
"index": 171,
"index": 170,
"rgba": {
"r": 0,
"g": 0,
@@ -830724,7 +830695,7 @@
"font_name": "/NKEFHC+Calibri,Bold"
},
{
"index": 172,
"index": 171,
"rgba": {
"r": 0,
"g": 0,
@@ -830753,7 +830724,7 @@
"font_name": "/NKEFHC+Calibri,Bold"
},
{
"index": 173,
"index": 172,
"rgba": {
"r": 0,
"g": 0,
@@ -830782,7 +830753,7 @@
"font_name": "/NKEFHC+Calibri,Bold"
},
{
"index": 174,
"index": 173,
"rgba": {
"r": 0,
"g": 0,
@@ -830811,7 +830782,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 175,
"index": 174,
"rgba": {
"r": 0,
"g": 0,
@@ -830840,7 +830811,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 176,
"index": 175,
"rgba": {
"r": 0,
"g": 0,
@@ -830869,7 +830840,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 177,
"index": 176,
"rgba": {
"r": 0,
"g": 0,
@@ -830898,7 +830869,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 178,
"index": 177,
"rgba": {
"r": 0,
"g": 0,
@@ -830927,7 +830898,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 179,
"index": 178,
"rgba": {
"r": 0,
"g": 0,
@@ -830956,7 +830927,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 180,
"index": 179,
"rgba": {
"r": 0,
"g": 0,
@@ -830985,7 +830956,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 181,
"index": 180,
"rgba": {
"r": 0,
"g": 0,
@@ -831014,7 +830985,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 182,
"index": 181,
"rgba": {
"r": 0,
"g": 0,
@@ -831043,7 +831014,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 183,
"index": 182,
"rgba": {
"r": 0,
"g": 0,
@@ -831072,7 +831043,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 184,
"index": 183,
"rgba": {
"r": 0,
"g": 0,
@@ -831101,7 +831072,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 185,
"index": 184,
"rgba": {
"r": 0,
"g": 0,
@@ -831130,7 +831101,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 186,
"index": 185,
"rgba": {
"r": 0,
"g": 0,
@@ -831159,7 +831130,7 @@
"font_name": "/NKEFHC+Calibri,Bold"
},
{
"index": 187,
"index": 186,
"rgba": {
"r": 0,
"g": 0,
@@ -831188,7 +831159,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 188,
"index": 187,
"rgba": {
"r": 0,
"g": 0,
@@ -831217,7 +831188,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 189,
"index": 188,
"rgba": {
"r": 0,
"g": 0,
@@ -831246,7 +831217,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 190,
"index": 189,
"rgba": {
"r": 0,
"g": 0,
@@ -831275,7 +831246,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 191,
"index": 190,
"rgba": {
"r": 0,
"g": 0,
@@ -831304,7 +831275,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 192,
"index": 191,
"rgba": {
"r": 0,
"g": 0,
@@ -831333,7 +831304,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 193,
"index": 192,
"rgba": {
"r": 0,
"g": 0,
@@ -831362,7 +831333,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 194,
"index": 193,
"rgba": {
"r": 0,
"g": 0,
@@ -831391,7 +831362,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 195,
"index": 194,
"rgba": {
"r": 0,
"g": 0,
@@ -831420,7 +831391,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 196,
"index": 195,
"rgba": {
"r": 0,
"g": 0,
@@ -831449,7 +831420,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 197,
"index": 196,
"rgba": {
"r": 0,
"g": 0,
@@ -831478,7 +831449,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 198,
"index": 197,
"rgba": {
"r": 0,
"g": 0,
@@ -831507,7 +831478,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 199,
"index": 198,
"rgba": {
"r": 0,
"g": 0,
@@ -831536,7 +831507,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 200,
"index": 199,
"rgba": {
"r": 0,
"g": 0,
@@ -831565,7 +831536,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 201,
"index": 200,
"rgba": {
"r": 0,
"g": 0,
@@ -831594,7 +831565,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 202,
"index": 201,
"rgba": {
"r": 0,
"g": 0,
@@ -831623,7 +831594,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 203,
"index": 202,
"rgba": {
"r": 0,
"g": 0,
@@ -831652,7 +831623,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 204,
"index": 203,
"rgba": {
"r": 0,
"g": 0,
@@ -831681,7 +831652,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 205,
"index": 204,
"rgba": {
"r": 0,
"g": 0,
@@ -831710,7 +831681,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 206,
"index": 205,
"rgba": {
"r": 0,
"g": 0,
@@ -831739,7 +831710,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 207,
"index": 206,
"rgba": {
"r": 0,
"g": 0,
@@ -831768,7 +831739,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 208,
"index": 207,
"rgba": {
"r": 0,
"g": 0,
@@ -831797,7 +831768,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 209,
"index": 208,
"rgba": {
"r": 0,
"g": 0,
@@ -831826,7 +831797,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 210,
"index": 209,
"rgba": {
"r": 0,
"g": 0,
@@ -831855,7 +831826,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 211,
"index": 210,
"rgba": {
"r": 0,
"g": 0,
@@ -831884,7 +831855,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 212,
"index": 211,
"rgba": {
"r": 0,
"g": 0,
@@ -831913,7 +831884,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 213,
"index": 212,
"rgba": {
"r": 0,
"g": 0,
@@ -831942,7 +831913,7 @@
"font_name": "/NKEFHC+Calibri,Bold"
},
{
"index": 214,
"index": 213,
"rgba": {
"r": 0,
"g": 0,
@@ -831971,7 +831942,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 215,
"index": 214,
"rgba": {
"r": 0,
"g": 0,
@@ -832000,7 +831971,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 216,
"index": 215,
"rgba": {
"r": 0,
"g": 0,
@@ -832029,7 +832000,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 217,
"index": 216,
"rgba": {
"r": 0,
"g": 0,
@@ -832058,7 +832029,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 218,
"index": 217,
"rgba": {
"r": 0,
"g": 0,
@@ -832087,7 +832058,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 219,
"index": 218,
"rgba": {
"r": 0,
"g": 0,
@@ -832116,7 +832087,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 220,
"index": 219,
"rgba": {
"r": 0,
"g": 0,
@@ -832145,7 +832116,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 221,
"index": 220,
"rgba": {
"r": 0,
"g": 0,
@@ -832174,7 +832145,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 222,
"index": 221,
"rgba": {
"r": 0,
"g": 0,
@@ -832203,7 +832174,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 223,
"index": 222,
"rgba": {
"r": 0,
"g": 0,
@@ -832232,7 +832203,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 224,
"index": 223,
"rgba": {
"r": 0,
"g": 0,
@@ -832261,7 +832232,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 225,
"index": 224,
"rgba": {
"r": 0,
"g": 0,
@@ -832290,7 +832261,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 226,
"index": 225,
"rgba": {
"r": 0,
"g": 0,
@@ -832319,7 +832290,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 227,
"index": 226,
"rgba": {
"r": 0,
"g": 0,
@@ -832348,7 +832319,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 228,
"index": 227,
"rgba": {
"r": 0,
"g": 0,
@@ -832377,7 +832348,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 229,
"index": 228,
"rgba": {
"r": 0,
"g": 0,
@@ -832406,7 +832377,7 @@
"font_name": "/NKEDOI+Calibri"
},
{
"index": 230,
"index": 229,
"rgba": {
"r": 0,
"g": 0,

1426
uv.lock generated

File diff suppressed because it is too large Load Diff