feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s

* Add DoclingParseV3 backend implementation

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Use docling-core with docling-parse types

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes and test updates

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* update test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* update test units

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add back DoclingParse v1 backend, pipeline options

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update locks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: update docling-core to 2.22.0

Update dependency library docling-core to latest release 2.22.0
Fix regression tests and ground truth files

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Ground-truth files updated

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update tests, use TextCell.from_ocr property

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Text fixes, new test data

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Rename docling backend to v4

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Test all backends, fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset all tests to use docling-parse v1 for now

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for DPv4 backend init, better test coverage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* test_input_doc use default backend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2025-03-18 10:38:19 +01:00
committed by GitHub
parent 772487f9c9
commit 3960b199d6
126 changed files with 1138 additions and 709 deletions

View File

@@ -6,11 +6,12 @@ from typing import Iterable, List
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
@@ -104,11 +105,13 @@ class BaseOcrModel(BasePageModel):
p.dimension = 2
idx = index.Index(properties=p)
for i, cell in enumerate(programmatic_cells):
idx.insert(i, cell.bbox.as_tuple())
idx.insert(i, cell.rect.to_bounding_box().as_tuple())
def is_overlapping_with_existing_cells(ocr_cell):
# Query the R-tree to get overlapping rectangles
possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
possible_matches_index = list(
idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
)
return (
len(possible_matches_index) > 0
@@ -125,10 +128,7 @@ class BaseOcrModel(BasePageModel):
"""
if self.options.force_full_page_ocr:
# If full-page OCR is forced, use only the OCR cells
cells = [
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
for c_ocr in ocr_cells
]
cells = ocr_cells
return cells
## Remove OCR cells which overlap with programmatic cells.
@@ -156,7 +156,7 @@ class BaseOcrModel(BasePageModel):
# Draw OCR and programmatic cells
for tc in page.cells:
x0, y0, x1, y1 = tc.bbox.as_tuple()
x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
y0 *= scale_x
y1 *= scale_y
x0 *= scale_x
@@ -165,9 +165,8 @@ class BaseOcrModel(BasePageModel):
if y1 <= y0:
y1, y0 = y0, y1
color = "gray"
if isinstance(tc, OcrCell):
color = "magenta"
color = "magenta" if tc.from_ocr else "gray"
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
if show:

View File

@@ -6,8 +6,9 @@ from typing import Iterable, List, Optional
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
@@ -148,18 +149,22 @@ class EasyOcrModel(BaseOcrModel):
del im
cells = [
OcrCell(
id=ix,
TextCell(
index=ix,
text=line[1],
orig=line[1],
from_ocr=True,
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
)
),
)
for ix, line in enumerate(result)

View File

@@ -3,8 +3,9 @@ import tempfile
from typing import Iterable, Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrMacOptions
from docling.datamodel.settings import settings
@@ -94,13 +95,17 @@ class OcrMacModel(BaseOcrModel):
bottom = y2 / self.scale
cells.append(
OcrCell(
id=ix,
TextCell(
index=ix,
text=text,
orig=text,
from_ocr=True,
confidence=confidence,
bbox=BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
)
),
)
)

View File

@@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder
# Options consumed by the page preprocessing stage.
class PagePreprocessingOptions(BaseModel):
# Optional image scale factor; its exact use is not visible in this hunk — TODO confirm.
images_scale: Optional[float]
# When true, the preprocessing model also stores the backend's segmented page
# on `page.parsed_page` (via `page._backend.get_segmented_page()`).
create_parsed_page: bool
class PagePreprocessingModel(BasePageModel):
@@ -55,6 +56,9 @@ class PagePreprocessingModel(BasePageModel):
page.cells = list(page._backend.get_text_cells())
if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page()
# DEBUG code:
def draw_text_boxes(image, cells, show: bool = False):
draw = ImageDraw.Draw(image)

View File

@@ -3,8 +3,9 @@ from typing import Iterable
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
@@ -100,18 +101,26 @@ class RapidOcrModel(BaseOcrModel):
if result is not None:
cells = [
OcrCell(
id=ix,
TextCell(
index=ix,
text=line[1],
orig=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
from_ocr=True,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale)
+ ocr_rect.l,
(line[0][0][1] / self.scale)
+ ocr_rect.t,
(line[0][2][0] / self.scale)
+ ocr_rect.l,
(line[0][2][1] / self.scale)
+ ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
)
),
)
for ix, line in enumerate(result)

View File

@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union
import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_core.types.doc.page import BoundingRectangle
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw
@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
for cell in table_element.cluster.cells:
x0, y0, x1, y1 = cell.bbox.as_tuple()
x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
x0 *= scale_x
x1 *= scale_x
y0 *= scale_x
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
# Only allow non-empty strings (i.e. not whitespace-only) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.bbox = new_cell.bbox.scaled(
scale=self.scale
new_cell.rect = BoundingRectangle.from_bounding_box(
new_cell.rect.to_bounding_box().scaled(
scale=self.scale
)
)
tokens.append(new_cell.model_dump())
tokens.append(
{
"id": new_cell.index,
"text": new_cell.text,
"bbox": new_cell.rect.to_bounding_box().model_dump(),
}
)
page_input["tokens"] = tokens
tf_output = self.tf_predictor.multi_table_predict(

View File

@@ -8,8 +8,9 @@ from typing import Iterable, List, Optional, Tuple
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings
@@ -228,18 +229,22 @@ class TesseractOcrCliModel(BaseOcrModel):
t = b + h
r = l + w
cell = OcrCell(
id=ix,
cell = TextCell(
index=ix,
text=text,
orig=text,
from_ocr=True,
confidence=conf / 100.0,
bbox=BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
)
),
)
all_ocr_cells.append(cell)

View File

@@ -2,8 +2,9 @@ import logging
from typing import Iterable
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings
@@ -173,13 +174,17 @@ class TesseractOcrModel(BaseOcrModel):
top = (box["y"] + box["h"]) / self.scale
cells.append(
OcrCell(
id=ix,
TextCell(
index=ix,
text=text,
orig=text,
from_ocr=True,
confidence=confidence,
bbox=BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
),
),
)
)