mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Add back DoclingParse v1 backend, pipeline options
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
8a45a2cafa
commit
9ebd7108f2
234
docling/backend/docling_parse_backend.py
Normal file
234
docling/backend/docling_parse_backend.py
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
import logging
|
||||||
|
import random
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional, Union
|
||||||
|
|
||||||
|
import pypdfium2 as pdfium
|
||||||
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||||
|
from docling_parse.pdf_parsers import pdf_parser_v1
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend using docling-parse v1 for text extraction and pypdfium2
    for rendering and page geometry."""

    def __init__(
        self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        """Parse a single page.

        Args:
            parser: docling-parse v1 parser that has already loaded the document.
            document_hash: key under which the document is registered in the parser.
            page_no: zero-based page index.
            page_obj: pypdfium2 page handle, used for rendering and page size.
        """
        self._ppage = page_obj
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        self.valid = "pages" in parsed_page
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            # FIX: always define _dpage so accessors on an invalid page fail
            # soft instead of raising AttributeError.
            self._dpage = None
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """Return True if docling-parse produced usable data for this page."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Concatenate, space-separated, the text of all cells overlapping
        *bbox* by more than 50% of the cell's own area."""
        if not self.valid:
            return ""
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()
        # Parser coordinates are rescaled to the pypdfium2 page size below.
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for i in range(len(self._dpage["cells"])):
            rect = self._dpage["cells"][i]["box"]["device"]
            x0, y0, x1, y1 = rect
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += self._dpage["cells"][i]["content"]["rnormalized"]

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        # The v1 parser does not provide a segmented-page representation.
        return None

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return all text cells of the page, rescaled from parser coordinates
        to the pypdfium2 page size, with top-left origin."""
        cells: List[TextCell] = []
        cell_counter = 0

        if not self.valid:
            return cells

        page_size = self.get_size()

        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        for i in range(len(self._dpage["cells"])):
            rect = self._dpage["cells"][i]["box"]["device"]
            x0, y0, x1, y1 = rect

            # Normalize so that (x0, y0) is always the bottom-left corner.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
            cells.append(
                TextCell(
                    index=cell_counter,
                    text=text_piece,
                    orig=text_piece,
                    rect=BoundingRectangle.from_bounding_box(
                        BoundingBox(
                            l=x0 * page_size.width / parser_width,
                            b=y0 * page_size.height / parser_height,
                            r=x1 * page_size.width / parser_width,
                            t=y1 * page_size.height / parser_height,
                            coord_origin=CoordOrigin.BOTTOMLEFT,
                        )
                    ),
                ).to_top_left_origin(page_size.height),
            )

            cell_counter += 1

        def draw_clusters_and_cells():
            # Debug helper: render the page and outline each cell in a random color.
            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()

        # draw_clusters_and_cells()

        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of embedded bitmap images, scaled by *scale*."""
        # FIX: guard against invalid pages, where _dpage is None; the original
        # would raise AttributeError here instead of yielding nothing.
        if not self.valid:
            return
        AREA_THRESHOLD = 0  # 32 * 32

        for i in range(len(self._dpage["images"])):
            bitmap = self._dpage["images"][i]
            cropbox = BoundingBox.from_tuple(
                bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(self.get_size().height)

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page (or the *cropbox* region of it) at *scale* as a PIL image."""
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # pdfium expects the crop as padding from each side, bottom-left origin.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page size in points, as reported by pypdfium2."""
        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Release the page resources held by this backend."""
        self._ppage = None
        self._dpage = None
|
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend combining docling-parse v1 (text extraction) with
    pypdfium2 (rendering and page geometry)."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document in both pdfium and the docling-parse v1 parser.

        Raises:
            RuntimeError: if docling-parse cannot load the document.
        """
        super().__init__(in_doc, path_or_stream)

        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v1()

        # Register the document with the parser under its hash; the loader
        # depends on whether we were handed an in-memory stream or a path.
        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, self.path_or_stream
            )
        elif isinstance(self.path_or_stream, Path):
            success = self.parser.load_document(
                self.document_hash, str(self.path_or_stream)
            )
        else:
            success = False

        if not success:
            raise RuntimeError(
                f"docling-parse could not load document with hash {self.document_hash}."
            )

    def page_count(self) -> int:
        """Number of pages in the document."""
        return len(self._pdoc)  # To be replaced with docling-parse API

    def load_page(self, page_no: int) -> DoclingParsePageBackend:
        """Build a page backend for the zero-based page index *page_no*."""
        pdfium_page = self._pdoc[page_no]
        return DoclingParsePageBackend(
            self.parser, self.document_hash, page_no, pdfium_page
        )

    def is_valid(self) -> bool:
        """A document is considered valid when it has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Release parser and pdfium resources."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        self._pdoc.close()
        self._pdoc = None
@ -65,9 +65,9 @@ class DoclingParseV3PageBackend(PdfPageBackend):
|
|||||||
for cell in self._dpage.textline_cells:
|
for cell in self._dpage.textline_cells:
|
||||||
rect = cell.rect
|
rect = cell.rect
|
||||||
|
|
||||||
if rect.r_x2 < rect.r_x0:
|
# if rect.r_x2 < rect.r_x0:
|
||||||
rect.r_x0, rect.r_x2 = rect.r_x2, rect.r_x0
|
# rect.r_x0, rect.r_x2 = rect.r_x2, rect.r_x0
|
||||||
rect.r_y3, rect.r_y1 = rect.r_y1, rect.r_y3
|
# rect.r_y3, rect.r_y1 = rect.r_y1, rect.r_y3
|
||||||
|
|
||||||
# rect.r_x2, rect.r_x3 = rect.r_x3, rect.r_x2
|
# rect.r_x2, rect.r_x3 = rect.r_x3, rect.r_x2
|
||||||
|
|
||||||
|
@ -412,7 +412,9 @@ def convert(
|
|||||||
if artifacts_path is not None:
|
if artifacts_path is not None:
|
||||||
pipeline_options.artifacts_path = artifacts_path
|
pipeline_options.artifacts_path = artifacts_path
|
||||||
|
|
||||||
if pdf_backend == PdfBackend.DLPARSE_V2:
|
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||||
|
backend = DoclingParseV2DocumentBackend
|
||||||
|
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||||
backend = DoclingParseV2DocumentBackend
|
backend = DoclingParseV2DocumentBackend
|
||||||
elif pdf_backend == PdfBackend.DLPARSE_V3:
|
elif pdf_backend == PdfBackend.DLPARSE_V3:
|
||||||
backend = DoclingParseV3DocumentBackend # type: ignore
|
backend = DoclingParseV3DocumentBackend # type: ignore
|
||||||
|
@ -299,6 +299,7 @@ class PdfBackend(str, Enum):
|
|||||||
"""Enum of valid PDF backends."""
|
"""Enum of valid PDF backends."""
|
||||||
|
|
||||||
PYPDFIUM2 = "pypdfium2"
|
PYPDFIUM2 = "pypdfium2"
|
||||||
|
DLPARSE_V1 = "dlparse_v1"
|
||||||
DLPARSE_V2 = "dlparse_v2"
|
DLPARSE_V2 = "dlparse_v2"
|
||||||
DLPARSE_V3 = "dlparse_v3"
|
DLPARSE_V3 = "dlparse_v3"
|
||||||
|
|
||||||
@ -381,3 +382,5 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|||||||
"before conversion and then use the `TableItem.get_image` function."
|
"before conversion and then use the `TableItem.get_image` function."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
generate_parsed_pages: bool = False
|
||||||
|
@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder
|
|||||||
|
|
||||||
class PagePreprocessingOptions(BaseModel):
|
class PagePreprocessingOptions(BaseModel):
|
||||||
images_scale: Optional[float]
|
images_scale: Optional[float]
|
||||||
|
create_parsed_page: bool
|
||||||
|
|
||||||
|
|
||||||
class PagePreprocessingModel(BasePageModel):
|
class PagePreprocessingModel(BasePageModel):
|
||||||
@ -54,7 +55,9 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
|
|
||||||
page.cells = list(page._backend.get_text_cells())
|
page.cells = list(page._backend.get_text_cells())
|
||||||
page.parsed_page = page._backend.get_segmented_page()
|
|
||||||
|
if self.options.create_parsed_page:
|
||||||
|
page.parsed_page = page._backend.get_segmented_page()
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_text_boxes(image, cells, show: bool = False):
|
def draw_text_boxes(image, cells, show: bool = False):
|
||||||
|
@ -87,7 +87,8 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
# Pre-processing
|
# Pre-processing
|
||||||
PagePreprocessingModel(
|
PagePreprocessingModel(
|
||||||
options=PagePreprocessingOptions(
|
options=PagePreprocessingOptions(
|
||||||
images_scale=pipeline_options.images_scale
|
images_scale=pipeline_options.images_scale,
|
||||||
|
create_parsed_page=pipeline_options.generate_parsed_pages,
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
# OCR
|
# OCR
|
||||||
|
2
poetry.lock
generated
2
poetry.lock
generated
@ -898,7 +898,7 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
|
|||||||
type = "git"
|
type = "git"
|
||||||
url = "https://github.com/DS4SD/docling-core"
|
url = "https://github.com/DS4SD/docling-core"
|
||||||
reference = "cau/docling-parse-types"
|
reference = "cau/docling-parse-types"
|
||||||
resolved_reference = "a2f1fccf80324e74c1ed66574bfa2bc02163e2ae"
|
resolved_reference = "7cb80880a4781e781cf797d42bda34498cf81184"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling-ibm-models"
|
name = "docling-ibm-models"
|
||||||
|
77
tests/test_backend_docling_parse.py
Normal file
77
tests/test_backend_docling_parse.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from docling_core.types.doc import BoundingBox
|
||||||
|
|
||||||
|
from docling.backend.docling_parse_backend import (
|
||||||
|
DoclingParseDocumentBackend,
|
||||||
|
DoclingParsePageBackend,
|
||||||
|
)
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def test_doc_path():
    # Sample document used by most tests below: the DocLayNet paper (arXiv 2206.01062).
    return Path("./tests/data/pdf/2206.01062.pdf")
||||||
|
|
||||||
|
|
||||||
|
def _get_backend(pdf_doc):
    """Build a DoclingParseDocumentBackend for the given PDF path."""
    in_doc = InputDocument(
        path_or_stream=pdf_doc,
        format=InputFormat.PDF,
        backend=DoclingParseDocumentBackend,
    )
    return in_doc._backend
||||||
|
|
||||||
|
|
||||||
|
def test_text_cell_counts():
    """Loading the same page repeatedly must yield an identical cell count."""
    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

    doc_backend = _get_backend(pdf_doc)

    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for _ in range(10):
            # FIX: load the page under test, not always page 0 — otherwise the
            # outer loop over page_index exercises nothing new.
            page_backend: DoclingParsePageBackend = doc_backend.load_page(page_index)
            cells = list(page_backend.get_text_cells())

            if last_cell_count is None:
                last_cell_count = len(cells)

            # FIX: direct assertion instead of `if ...: assert False`.
            assert (
                len(cells) == last_cell_count
            ), "Loading page multiple times yielded non-identical text cell counts"
            last_cell_count = len(cells)
||||||
|
|
||||||
|
|
||||||
|
def test_get_text_from_rect(test_doc_path):
    """get_text_in_rect should recover the DocLayNet paper title from page 0."""
    doc_backend = _get_backend(test_doc_path)
    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

    # Get the title text of the DocLayNet paper
    title_bbox = BoundingBox(l=102, t=77, r=511, b=124)
    textpiece = page_backend.get_text_in_rect(bbox=title_bbox)
    expected = (
        "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
    )

    assert textpiece.strip() == expected
||||||
|
|
||||||
|
|
||||||
|
def test_crop_page_image(test_doc_path):
    """Cropping a region at 2x scale must yield an image of the matching pixel size."""
    doc_backend = _get_backend(test_doc_path)
    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    cropbox = BoundingBox(l=317, t=246, r=574, b=527)
    im = page_backend.get_page_image(scale=2, cropbox=cropbox)

    # FIX: the test previously asserted nothing; get_page_image resizes its
    # output to round(cropbox dimensions * scale), so pin that contract.
    assert im.size == (round(cropbox.width * 2), round(cropbox.height * 2))
    # im.show()
||||||
|
|
||||||
|
|
||||||
|
def test_num_pages(test_doc_path):
    """The sample document has exactly 9 pages."""
    doc_backend = _get_backend(test_doc_path)
    # FIX: the comparison result was silently discarded (missing `assert`),
    # so the test could never fail.
    assert doc_backend.page_count() == 9
|
Loading…
Reference in New Issue
Block a user