mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-16 16:48:21 +00:00
feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s
* Add DoclingParseV3 backend implementation
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Use docling-core with docling-parse types
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes and test updates
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset tests
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test cases
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test units
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add back DoclingParse v1 backend, pipeline options
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update locks
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: update docling-core to 2.22.0
  Update dependency library docling-core to latest release 2.22.0
  Fix regression tests and ground truth files
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
* Ground-truth files updated
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update tests, use TextCell.from_ocr property
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Text fixes, new test data
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Rename docling backend to v4
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Test all backends, fixes
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset all tests to use docling-parse v1 for now
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes for DPv4 backend init, better test coverage
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* test_input_doc use default backend
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -6,11 +6,12 @@ from typing import Iterable, List
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
|
||||
from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import binary_dilation, find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -104,11 +105,13 @@ class BaseOcrModel(BasePageModel):
|
||||
p.dimension = 2
|
||||
idx = index.Index(properties=p)
|
||||
for i, cell in enumerate(programmatic_cells):
|
||||
idx.insert(i, cell.bbox.as_tuple())
|
||||
idx.insert(i, cell.rect.to_bounding_box().as_tuple())
|
||||
|
||||
def is_overlapping_with_existing_cells(ocr_cell):
|
||||
# Query the R-tree to get overlapping rectangles
|
||||
possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
|
||||
possible_matches_index = list(
|
||||
idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
|
||||
)
|
||||
|
||||
return (
|
||||
len(possible_matches_index) > 0
|
||||
@@ -125,10 +128,7 @@ class BaseOcrModel(BasePageModel):
|
||||
"""
|
||||
if self.options.force_full_page_ocr:
|
||||
# If a full page OCR is forced, use only the OCR cells
|
||||
cells = [
|
||||
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
||||
for c_ocr in ocr_cells
|
||||
]
|
||||
cells = ocr_cells
|
||||
return cells
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
@@ -156,7 +156,7 @@ class BaseOcrModel(BasePageModel):
|
||||
|
||||
# Draw OCR and programmatic cells
|
||||
for tc in page.cells:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
|
||||
y0 *= scale_x
|
||||
y1 *= scale_y
|
||||
x0 *= scale_x
|
||||
@@ -165,9 +165,8 @@ class BaseOcrModel(BasePageModel):
|
||||
if y1 <= y0:
|
||||
y1, y0 = y0, y1
|
||||
|
||||
color = "gray"
|
||||
if isinstance(tc, OcrCell):
|
||||
color = "magenta"
|
||||
color = "magenta" if tc.from_ocr else "gray"
|
||||
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
||||
|
||||
if show:
|
||||
|
||||
@@ -6,8 +6,9 @@ from typing import Iterable, List, Optional
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
@@ -148,18 +149,22 @@ class EasyOcrModel(BaseOcrModel):
|
||||
del im
|
||||
|
||||
cells = [
|
||||
OcrCell(
|
||||
id=ix,
|
||||
TextCell(
|
||||
index=ix,
|
||||
text=line[1],
|
||||
orig=line[1],
|
||||
from_ocr=True,
|
||||
confidence=line[2],
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
),
|
||||
)
|
||||
for ix, line in enumerate(result)
|
||||
|
||||
@@ -3,8 +3,9 @@ import tempfile
|
||||
from typing import Iterable, Optional, Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrMacOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -94,13 +95,17 @@ class OcrMacModel(BaseOcrModel):
|
||||
bottom = y2 / self.scale
|
||||
|
||||
cells.append(
|
||||
OcrCell(
|
||||
id=ix,
|
||||
TextCell(
|
||||
index=ix,
|
||||
text=text,
|
||||
orig=text,
|
||||
from_ocr=True,
|
||||
confidence=confidence,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
|
||||
images_scale: Optional[float]
|
||||
create_parsed_page: bool
|
||||
|
||||
|
||||
class PagePreprocessingModel(BasePageModel):
|
||||
@@ -55,6 +56,9 @@ class PagePreprocessingModel(BasePageModel):
|
||||
|
||||
page.cells = list(page._backend.get_text_cells())
|
||||
|
||||
if self.options.create_parsed_page:
|
||||
page.parsed_page = page._backend.get_segmented_page()
|
||||
|
||||
# DEBUG code:
|
||||
def draw_text_boxes(image, cells, show: bool = False):
|
||||
draw = ImageDraw.Draw(image)
|
||||
|
||||
@@ -3,8 +3,9 @@ from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
@@ -100,18 +101,26 @@ class RapidOcrModel(BaseOcrModel):
|
||||
|
||||
if result is not None:
|
||||
cells = [
|
||||
OcrCell(
|
||||
id=ix,
|
||||
TextCell(
|
||||
index=ix,
|
||||
text=line[1],
|
||||
orig=line[1],
|
||||
confidence=line[2],
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
from_ocr=True,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale)
|
||||
+ ocr_rect.l,
|
||||
(line[0][0][1] / self.scale)
|
||||
+ ocr_rect.t,
|
||||
(line[0][2][0] / self.scale)
|
||||
+ ocr_rect.l,
|
||||
(line[0][2][1] / self.scale)
|
||||
+ ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
),
|
||||
)
|
||||
for ix, line in enumerate(result)
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||
|
||||
for cell in table_element.cluster.cells:
|
||||
x0, y0, x1, y1 = cell.bbox.as_tuple()
|
||||
x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
|
||||
x0 *= scale_x
|
||||
x1 *= scale_x
|
||||
y0 *= scale_x
|
||||
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
|
||||
# Only allow non-empty strings (i.e. not just whitespace) into the cells of a table
|
||||
if len(c.text.strip()) > 0:
|
||||
new_cell = copy.deepcopy(c)
|
||||
new_cell.bbox = new_cell.bbox.scaled(
|
||||
scale=self.scale
|
||||
new_cell.rect = BoundingRectangle.from_bounding_box(
|
||||
new_cell.rect.to_bounding_box().scaled(
|
||||
scale=self.scale
|
||||
)
|
||||
)
|
||||
|
||||
tokens.append(new_cell.model_dump())
|
||||
tokens.append(
|
||||
{
|
||||
"id": new_cell.index,
|
||||
"text": new_cell.text,
|
||||
"bbox": new_cell.rect.to_bounding_box().model_dump(),
|
||||
}
|
||||
)
|
||||
page_input["tokens"] = tokens
|
||||
|
||||
tf_output = self.tf_predictor.multi_table_predict(
|
||||
|
||||
@@ -8,8 +8,9 @@ from typing import Iterable, List, Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -228,18 +229,22 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
t = b + h
|
||||
r = l + w
|
||||
|
||||
cell = OcrCell(
|
||||
id=ix,
|
||||
cell = TextCell(
|
||||
index=ix,
|
||||
text=text,
|
||||
orig=text,
|
||||
from_ocr=True,
|
||||
confidence=conf / 100.0,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(l / self.scale) + ocr_rect.l,
|
||||
(b / self.scale) + ocr_rect.t,
|
||||
(r / self.scale) + ocr_rect.l,
|
||||
(t / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(l / self.scale) + ocr_rect.l,
|
||||
(b / self.scale) + ocr_rect.t,
|
||||
(r / self.scale) + ocr_rect.l,
|
||||
(t / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
),
|
||||
)
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
@@ -2,8 +2,9 @@ import logging
|
||||
from typing import Iterable
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -173,13 +174,17 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
top = (box["y"] + box["h"]) / self.scale
|
||||
|
||||
cells.append(
|
||||
OcrCell(
|
||||
id=ix,
|
||||
TextCell(
|
||||
index=ix,
|
||||
text=text,
|
||||
orig=text,
|
||||
from_ocr=True,
|
||||
confidence=confidence,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user