mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Fixes and test updates
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
78353f1697
commit
f411772569
@ -1,227 +0,0 @@
|
|||||||
import logging
|
|
||||||
import random
|
|
||||||
from io import BytesIO
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Iterable, List, Optional, Union
|
|
||||||
|
|
||||||
import pypdfium2 as pdfium
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
|
||||||
from docling_parse.pdf_parsers import pdf_parser_v1
|
|
||||||
from PIL import Image, ImageDraw
|
|
||||||
from pypdfium2 import PdfPage
|
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
||||||
from docling.datamodel.base_models import Cell
|
|
||||||
from docling.datamodel.document import InputDocument
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend pairing docling-parse (v1) output with a pypdfium2 page.

    Text cells and bitmap boxes are read from the dictionary returned by the
    docling-parse parser; the page size and rendered images come from the
    pypdfium2 page object.
    """

    def __init__(
        self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        """Parse one page of a document already loaded into ``parser``.

        Args:
            parser: docling-parse parser holding the document.
            document_hash: Key under which the document was loaded.
            page_no: Page number handed to the parser.
            page_obj: pypdfium2 page used for size queries and rendering.
        """
        self._ppage = page_obj
        # Always define ``_dpage`` so that methods called on a page which
        # failed to parse never hit an AttributeError.
        self._dpage = None
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        self.valid = "pages" in parsed_page
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """Return whether the parser produced data for this page."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of all cells lying mostly inside ``bbox``.

        A cell contributes when more than half of its own area intersects
        ``bbox``. Contributions are separated by a single space. An invalid
        page yields an empty string.
        """
        if not self.valid:
            return ""

        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        # FIX - Replace with param in get_text_in_rect across backends (optional)
        scale = 1

        for cell in self._dpage["cells"]:
            x0, y0, x1, y1 = cell["box"]["device"]
            # Rescale parser coordinates to the pypdfium2 page size, then flip
            # to a top-left origin.
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            cell_area = cell_bbox.area()
            if cell_area <= 0:
                # A zero-area cell would raise ZeroDivisionError in the
                # overlap ratio below; skip degenerate cells.
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area
            if overlap_frac > 0.5:
                if text_piece:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        """Return every parsed text cell as a ``Cell`` with a top-left-origin bbox.

        Cell ids are assigned consecutively from 0 in parser order. An invalid
        page yields an empty list.
        """
        cells: List[Cell] = []

        if not self.valid:
            return cells

        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        for cell_counter, cell in enumerate(self._dpage["cells"]):
            x0, y0, x1, y1 = cell["box"]["device"]

            # Normalise so that (x0, y0) is the lower and (x1, y1) the upper
            # coordinate on each axis.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            cells.append(
                Cell(
                    id=cell_counter,
                    text=cell["content"]["rnormalized"],
                    bbox=BoundingBox(
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    ).to_top_left_origin(page_size.height),
                )
            )

        def draw_clusters_and_cells():
            # Debug aid, not called in normal operation: outlines every cell
            # in a random colour on a freshly rendered page image.
            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()

        # draw_clusters_and_cells()

        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of bitmap images on the page.

        Boxes are converted to a top-left origin and multiplied by ``scale``.
        Only boxes whose area exceeds ``AREA_THRESHOLD`` are yielded. An
        invalid page yields nothing.
        """
        AREA_THRESHOLD = 0  # 32 * 32

        if not self.valid:
            # No parser output is available for this page.
            return

        page_height = self.get_size().height
        for bitmap in self._dpage["images"]:
            cropbox = BoundingBox.from_tuple(
                bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or a crop of it, as a PIL image.

        Args:
            scale: Output scale relative to the page size.
            cropbox: Region to render; the whole page when omitted.

        Returns:
            An image of size ``cropbox.width * scale`` by
            ``cropbox.height * scale`` (rounded).
        """
        page_size = self.get_size()

        if cropbox is None:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Turn the crop box into per-edge distances from the page border,
            # which is what is handed to ``render(crop=...)`` below.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page size as reported by pypdfium2."""
        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the pypdfium2 page and the parsed page data."""
        self._ppage = None
        self._dpage = None
|
|
||||||
|
|
||||||
|
|
||||||
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend that loads a PDF into both pypdfium2 and docling-parse (v1)."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document with pypdfium2 and register it with docling-parse.

        Args:
            in_doc: Input document descriptor forwarded to the base class.
            path_or_stream: File path or in-memory stream of the PDF.

        Raises:
            RuntimeError: If docling-parse reports that it could not load the
                document, or if ``path_or_stream`` is neither a ``BytesIO``
                nor a ``Path``.
        """
        super().__init__(in_doc, path_or_stream)

        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v1()

        success = False
        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, self.path_or_stream
            )
        elif isinstance(self.path_or_stream, Path):
            success = self.parser.load_document(
                self.document_hash, str(self.path_or_stream)
            )

        if not success:
            # Release the pypdfium2 document opened above; otherwise the
            # handle would stay open once the exception propagates.
            self._pdoc.close()
            self._pdoc = None
            raise RuntimeError(
                f"docling-parse could not load document with hash {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the number of pages reported by pypdfium2."""
        return len(self._pdoc)  # To be replaced with docling-parse API

    def load_page(self, page_no: int) -> DoclingParsePageBackend:
        """Create the page backend for page ``page_no``."""
        return DoclingParsePageBackend(
            self.parser, self.document_hash, page_no, self._pdoc[page_no]
        )

    def is_valid(self) -> bool:
        """Return whether the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Unload the document from docling-parse and close the pypdfium2 handle."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        # Guard so that a repeated unload() does not call close() on None.
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None
|
|
@ -6,12 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
|||||||
|
|
||||||
import pypdfium2 as pdfium
|
import pypdfium2 as pdfium
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||||
from docling_parse.pdf_parsers import pdf_parser_v2
|
from docling_parse.pdf_parsers import pdf_parser_v2
|
||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from pypdfium2 import PdfPage
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import Cell, Size
|
from docling.datamodel.base_models import Size
|
||||||
from docling.utils.locks import pypdfium2_lock
|
from docling.utils.locks import pypdfium2_lock
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@ -78,8 +79,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
|
|
||||||
return text_piece
|
return text_piece
|
||||||
|
|
||||||
def get_text_cells(self) -> Iterable[Cell]:
|
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||||
cells: List[Cell] = []
|
return None
|
||||||
|
|
||||||
|
def get_text_cells(self) -> Iterable[TextCell]:
|
||||||
|
cells: List[TextCell] = []
|
||||||
cell_counter = 0
|
cell_counter = 0
|
||||||
|
|
||||||
if not self.valid:
|
if not self.valid:
|
||||||
@ -106,16 +110,19 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
|
|
||||||
text_piece = cell_data[cells_header.index("text")]
|
text_piece = cell_data[cells_header.index("text")]
|
||||||
cells.append(
|
cells.append(
|
||||||
Cell(
|
TextCell(
|
||||||
id=cell_counter,
|
index=cell_counter,
|
||||||
text=text_piece,
|
text=text_piece,
|
||||||
bbox=BoundingBox(
|
orig=text_piece,
|
||||||
|
rect=BoundingRectangle.from_bounding_box(
|
||||||
|
BoundingBox(
|
||||||
# l=x0, b=y0, r=x1, t=y1,
|
# l=x0, b=y0, r=x1, t=y1,
|
||||||
l=x0 * page_size.width / parser_width,
|
l=x0 * page_size.width / parser_width,
|
||||||
b=y0 * page_size.height / parser_height,
|
b=y0 * page_size.height / parser_height,
|
||||||
r=x1 * page_size.width / parser_width,
|
r=x1 * page_size.width / parser_width,
|
||||||
t=y1 * page_size.height / parser_height,
|
t=y1 * page_size.height / parser_height,
|
||||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||||
|
)
|
||||||
).to_top_left_origin(page_size.height),
|
).to_top_left_origin(page_size.height),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -6,13 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
|||||||
|
|
||||||
import pypdfium2 as pdfium
|
import pypdfium2 as pdfium
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import SegmentedPdfPage
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||||
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from pypdfium2 import PdfPage
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import Cell, Size
|
from docling.datamodel.base_models import Size
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
@ -54,48 +54,15 @@ class DoclingParseV3PageBackend(PdfPageBackend):
|
|||||||
|
|
||||||
return text_piece
|
return text_piece
|
||||||
|
|
||||||
def get_text_cells(self) -> Iterable[Cell]:
|
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||||
cells: List[Cell] = []
|
return self._dpage
|
||||||
cell_counter = 0
|
|
||||||
|
|
||||||
|
def get_text_cells(self) -> Iterable[TextCell]:
|
||||||
page_size = self.get_size()
|
page_size = self.get_size()
|
||||||
|
|
||||||
for i, cell in enumerate(self._dpage.textline_cells):
|
[tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
|
||||||
cell_bbox = cell.rect.to_bounding_box()
|
|
||||||
|
|
||||||
if cell_bbox.r < cell_bbox.l:
|
return self._dpage.textline_cells
|
||||||
cell_bbox.r, cell_bbox.l = cell_bbox.l, cell_bbox.r
|
|
||||||
if cell_bbox.b > cell_bbox.t:
|
|
||||||
cell_bbox.b, cell_bbox.t = cell_bbox.t, cell_bbox.b
|
|
||||||
|
|
||||||
text_piece = cell.text
|
|
||||||
cells.append(
|
|
||||||
Cell(
|
|
||||||
id=cell_counter,
|
|
||||||
text=text_piece,
|
|
||||||
bbox=cell_bbox.to_top_left_origin(page_size.height),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
cell_counter += 1
|
|
||||||
|
|
||||||
def draw_clusters_and_cells():
|
|
||||||
image = (
|
|
||||||
self.get_page_image()
|
|
||||||
) # make new image to avoid drawing on the saved ones
|
|
||||||
draw = ImageDraw.Draw(image)
|
|
||||||
for c in cells:
|
|
||||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
|
||||||
cell_color = (
|
|
||||||
random.randint(30, 140),
|
|
||||||
random.randint(30, 140),
|
|
||||||
random.randint(30, 140),
|
|
||||||
)
|
|
||||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
|
||||||
image.show()
|
|
||||||
|
|
||||||
# draw_clusters_and_cells()
|
|
||||||
|
|
||||||
return cells
|
|
||||||
|
|
||||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||||
AREA_THRESHOLD = 0 # 32 * 32
|
AREA_THRESHOLD = 0 # 32 * 32
|
||||||
|
@ -4,10 +4,11 @@ from pathlib import Path
|
|||||||
from typing import Iterable, Optional, Set, Union
|
from typing import Iterable, Optional, Set, Union
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, Size
|
from docling_core.types.doc import BoundingBox, Size
|
||||||
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||||
from docling.datamodel.base_models import Cell, InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_text_cells(self) -> Iterable[Cell]:
|
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_text_cells(self) -> Iterable[TextCell]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
|||||||
import pypdfium2 as pdfium
|
import pypdfium2 as pdfium
|
||||||
import pypdfium2.raw as pdfium_c
|
import pypdfium2.raw as pdfium_c
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from pypdfium2 import PdfTextPage
|
from pypdfium2 import PdfTextPage
|
||||||
from pypdfium2._helpers.misc import PdfiumError
|
from pypdfium2._helpers.misc import PdfiumError
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import Cell
|
|
||||||
from docling.utils.locks import pypdfium2_lock
|
from docling.utils.locks import pypdfium2_lock
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
|
|
||||||
return text_piece
|
return text_piece
|
||||||
|
|
||||||
def get_text_cells(self) -> Iterable[Cell]:
|
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_text_cells(self) -> Iterable[TextCell]:
|
||||||
with pypdfium2_lock:
|
with pypdfium2_lock:
|
||||||
if not self.text_page:
|
if not self.text_page:
|
||||||
self.text_page = self._ppage.get_textpage()
|
self.text_page = self._ppage.get_textpage()
|
||||||
@ -84,11 +87,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
text_piece = self.text_page.get_text_bounded(*rect)
|
text_piece = self.text_page.get_text_bounded(*rect)
|
||||||
x0, y0, x1, y1 = rect
|
x0, y0, x1, y1 = rect
|
||||||
cells.append(
|
cells.append(
|
||||||
Cell(
|
TextCell(
|
||||||
id=cell_counter,
|
index=cell_counter,
|
||||||
text=text_piece,
|
text=text_piece,
|
||||||
bbox=BoundingBox(
|
orig=text_piece,
|
||||||
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
|
rect=BoundingRectangle.from_bounding_box(
|
||||||
|
BoundingBox(
|
||||||
|
l=x0,
|
||||||
|
b=y0,
|
||||||
|
r=x1,
|
||||||
|
t=y1,
|
||||||
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||||
|
)
|
||||||
).to_top_left_origin(page_size.height),
|
).to_top_left_origin(page_size.height),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@ -97,51 +107,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
||||||
# The cell merging code below is to clean this up.
|
# The cell merging code below is to clean this up.
|
||||||
def merge_horizontal_cells(
|
def merge_horizontal_cells(
|
||||||
cells: List[Cell],
|
cells: List[TextCell],
|
||||||
horizontal_threshold_factor: float = 1.0,
|
horizontal_threshold_factor: float = 1.0,
|
||||||
vertical_threshold_factor: float = 0.5,
|
vertical_threshold_factor: float = 0.5,
|
||||||
) -> List[Cell]:
|
) -> List[TextCell]:
|
||||||
if not cells:
|
if not cells:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def group_rows(cells: List[Cell]) -> List[List[Cell]]:
|
def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
|
||||||
rows = []
|
rows = []
|
||||||
current_row = [cells[0]]
|
current_row = [cells[0]]
|
||||||
row_top = cells[0].bbox.t
|
row_top = cells[0].rect.to_bounding_box().t
|
||||||
row_bottom = cells[0].bbox.b
|
row_bottom = cells[0].rect.to_bounding_box().b
|
||||||
row_height = cells[0].bbox.height
|
row_height = cells[0].rect.to_bounding_box().height
|
||||||
|
|
||||||
for cell in cells[1:]:
|
for cell in cells[1:]:
|
||||||
vertical_threshold = row_height * vertical_threshold_factor
|
vertical_threshold = row_height * vertical_threshold_factor
|
||||||
if (
|
if (
|
||||||
abs(cell.bbox.t - row_top) <= vertical_threshold
|
abs(cell.rect.to_bounding_box().t - row_top)
|
||||||
and abs(cell.bbox.b - row_bottom) <= vertical_threshold
|
<= vertical_threshold
|
||||||
|
and abs(cell.rect.to_bounding_box().b - row_bottom)
|
||||||
|
<= vertical_threshold
|
||||||
):
|
):
|
||||||
current_row.append(cell)
|
current_row.append(cell)
|
||||||
row_top = min(row_top, cell.bbox.t)
|
row_top = min(row_top, cell.rect.to_bounding_box().t)
|
||||||
row_bottom = max(row_bottom, cell.bbox.b)
|
row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
|
||||||
row_height = row_bottom - row_top
|
row_height = row_bottom - row_top
|
||||||
else:
|
else:
|
||||||
rows.append(current_row)
|
rows.append(current_row)
|
||||||
current_row = [cell]
|
current_row = [cell]
|
||||||
row_top = cell.bbox.t
|
row_top = cell.rect.to_bounding_box().t
|
||||||
row_bottom = cell.bbox.b
|
row_bottom = cell.rect.to_bounding_box().b
|
||||||
row_height = cell.bbox.height
|
row_height = cell.rect.to_bounding_box().height
|
||||||
|
|
||||||
if current_row:
|
if current_row:
|
||||||
rows.append(current_row)
|
rows.append(current_row)
|
||||||
|
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
def merge_row(row: List[Cell]) -> List[Cell]:
|
def merge_row(row: List[TextCell]) -> List[TextCell]:
|
||||||
merged = []
|
merged = []
|
||||||
current_group = [row[0]]
|
current_group = [row[0]]
|
||||||
|
|
||||||
for cell in row[1:]:
|
for cell in row[1:]:
|
||||||
prev_cell = current_group[-1]
|
prev_cell = current_group[-1]
|
||||||
avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
|
avg_height = (
|
||||||
|
prev_cell.rect.height + cell.rect.to_bounding_box().height
|
||||||
|
) / 2
|
||||||
if (
|
if (
|
||||||
cell.bbox.l - prev_cell.bbox.r
|
cell.rect.to_bounding_box().l
|
||||||
|
- prev_cell.rect.to_bounding_box().r
|
||||||
<= avg_height * horizontal_threshold_factor
|
<= avg_height * horizontal_threshold_factor
|
||||||
):
|
):
|
||||||
current_group.append(cell)
|
current_group.append(cell)
|
||||||
@ -154,24 +169,29 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
|
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
def merge_group(group: List[Cell]) -> Cell:
|
def merge_group(group: List[TextCell]) -> TextCell:
|
||||||
if len(group) == 1:
|
if len(group) == 1:
|
||||||
return group[0]
|
return group[0]
|
||||||
|
|
||||||
merged_text = "".join(cell.text for cell in group)
|
merged_text = "".join(cell.text for cell in group)
|
||||||
merged_bbox = BoundingBox(
|
merged_bbox = BoundingBox(
|
||||||
l=min(cell.bbox.l for cell in group),
|
l=min(cell.rect.to_bounding_box().l for cell in group),
|
||||||
t=min(cell.bbox.t for cell in group),
|
t=min(cell.rect.to_bounding_box().t for cell in group),
|
||||||
r=max(cell.bbox.r for cell in group),
|
r=max(cell.rect.to_bounding_box().r for cell in group),
|
||||||
b=max(cell.bbox.b for cell in group),
|
b=max(cell.rect.to_bounding_box().b for cell in group),
|
||||||
|
)
|
||||||
|
return TextCell(
|
||||||
|
index=group[0].index,
|
||||||
|
text=merged_text,
|
||||||
|
orig=merged_text,
|
||||||
|
rect=BoundingRectangle.from_bounding_box(merged_bbox),
|
||||||
)
|
)
|
||||||
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
|
|
||||||
|
|
||||||
rows = group_rows(cells)
|
rows = group_rows(cells)
|
||||||
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
||||||
|
|
||||||
for i, cell in enumerate(merged_cells, 1):
|
for i, cell in enumerate(merged_cells, 1):
|
||||||
cell.id = i
|
cell.index = i
|
||||||
|
|
||||||
return merged_cells
|
return merged_cells
|
||||||
|
|
||||||
@ -181,7 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
) # make new image to avoid drawing on the saved ones
|
) # make new image to avoid drawing on the saved ones
|
||||||
draw = ImageDraw.Draw(image)
|
draw = ImageDraw.Draw(image)
|
||||||
for c in cells:
|
for c in cells:
|
||||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
||||||
cell_color = (
|
cell_color = (
|
||||||
random.randint(30, 140),
|
random.randint(30, 140),
|
||||||
random.randint(30, 140),
|
random.randint(30, 140),
|
||||||
|
@ -14,8 +14,8 @@ from docling_core.types.doc import ImageRefMode
|
|||||||
from docling_core.utils.file import resolve_source_to_path
|
from docling_core.utils.file import resolve_source_to_path
|
||||||
from pydantic import TypeAdapter
|
from pydantic import TypeAdapter
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
|
from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
@ -412,12 +412,12 @@ def convert(
|
|||||||
if artifacts_path is not None:
|
if artifacts_path is not None:
|
||||||
pipeline_options.artifacts_path = artifacts_path
|
pipeline_options.artifacts_path = artifacts_path
|
||||||
|
|
||||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
if pdf_backend == PdfBackend.DLPARSE_V2:
|
||||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
|
||||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
|
||||||
backend = DoclingParseV2DocumentBackend
|
backend = DoclingParseV2DocumentBackend
|
||||||
|
elif pdf_backend == PdfBackend.DLPARSE_V3:
|
||||||
|
backend = DoclingParseV3DocumentBackend # type: ignore
|
||||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||||
backend = PyPdfiumDocumentBackend
|
backend = PyPdfiumDocumentBackend # type: ignore
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ from docling_core.types.doc import (
|
|||||||
Size,
|
Size,
|
||||||
TableCell,
|
TableCell,
|
||||||
)
|
)
|
||||||
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||||
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
|
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
|
||||||
DocumentStream,
|
DocumentStream,
|
||||||
)
|
)
|
||||||
@ -123,14 +124,10 @@ class ErrorItem(BaseModel):
|
|||||||
error_message: str
|
error_message: str
|
||||||
|
|
||||||
|
|
||||||
class Cell(BaseModel):
|
# class Cell(BaseModel):
|
||||||
id: int
|
# id: int
|
||||||
text: str
|
# text: str
|
||||||
bbox: BoundingBox
|
# bbox: BoundingBox
|
||||||
|
|
||||||
|
|
||||||
class OcrCell(Cell):
|
|
||||||
confidence: float
|
|
||||||
|
|
||||||
|
|
||||||
class Cluster(BaseModel):
|
class Cluster(BaseModel):
|
||||||
@ -138,7 +135,7 @@ class Cluster(BaseModel):
|
|||||||
label: DocItemLabel
|
label: DocItemLabel
|
||||||
bbox: BoundingBox
|
bbox: BoundingBox
|
||||||
confidence: float = 1.0
|
confidence: float = 1.0
|
||||||
cells: List[Cell] = []
|
cells: List[TextCell] = []
|
||||||
children: List["Cluster"] = [] # Add child cluster support
|
children: List["Cluster"] = [] # Add child cluster support
|
||||||
|
|
||||||
|
|
||||||
@ -226,7 +223,8 @@ class Page(BaseModel):
|
|||||||
page_no: int
|
page_no: int
|
||||||
# page_hash: Optional[str] = None
|
# page_hash: Optional[str] = None
|
||||||
size: Optional[Size] = None
|
size: Optional[Size] = None
|
||||||
cells: List[Cell] = []
|
cells: List[TextCell] = []
|
||||||
|
parsed_page: Optional[SegmentedPdfPage] = None
|
||||||
predictions: PagePredictions = PagePredictions()
|
predictions: PagePredictions = PagePredictions()
|
||||||
assembled: Optional[AssembledUnit] = None
|
assembled: Optional[AssembledUnit] = None
|
||||||
|
|
||||||
|
@ -299,8 +299,8 @@ class PdfBackend(str, Enum):
|
|||||||
"""Enum of valid PDF backends."""
|
"""Enum of valid PDF backends."""
|
||||||
|
|
||||||
PYPDFIUM2 = "pypdfium2"
|
PYPDFIUM2 = "pypdfium2"
|
||||||
DLPARSE_V1 = "dlparse_v1"
|
|
||||||
DLPARSE_V2 = "dlparse_v2"
|
DLPARSE_V2 = "dlparse_v2"
|
||||||
|
DLPARSE_V3 = "dlparse_v3"
|
||||||
|
|
||||||
|
|
||||||
# Define an enum for the ocr engines
|
# Define an enum for the ocr engines
|
||||||
|
@ -6,11 +6,12 @@ from typing import Iterable, List
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
|
||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from rtree import index
|
from rtree import index
|
||||||
from scipy.ndimage import binary_dilation, find_objects, label
|
from scipy.ndimage import binary_dilation, find_objects, label
|
||||||
|
|
||||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import OcrOptions
|
from docling.datamodel.pipeline_options import OcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -104,11 +105,13 @@ class BaseOcrModel(BasePageModel):
|
|||||||
p.dimension = 2
|
p.dimension = 2
|
||||||
idx = index.Index(properties=p)
|
idx = index.Index(properties=p)
|
||||||
for i, cell in enumerate(programmatic_cells):
|
for i, cell in enumerate(programmatic_cells):
|
||||||
idx.insert(i, cell.bbox.as_tuple())
|
idx.insert(i, cell.rect.to_bounding_box().as_tuple())
|
||||||
|
|
||||||
def is_overlapping_with_existing_cells(ocr_cell):
|
def is_overlapping_with_existing_cells(ocr_cell):
|
||||||
# Query the R-tree to get overlapping rectangles
|
# Query the R-tree to get overlapping rectangles
|
||||||
possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
|
possible_matches_index = list(
|
||||||
|
idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
|
||||||
|
)
|
||||||
|
|
||||||
return (
|
return (
|
||||||
len(possible_matches_index) > 0
|
len(possible_matches_index) > 0
|
||||||
@ -125,10 +128,7 @@ class BaseOcrModel(BasePageModel):
|
|||||||
"""
|
"""
|
||||||
if self.options.force_full_page_ocr:
|
if self.options.force_full_page_ocr:
|
||||||
# If a full page OCR is forced, use only the OCR cells
|
# If a full page OCR is forced, use only the OCR cells
|
||||||
cells = [
|
cells = ocr_cells
|
||||||
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
|
||||||
for c_ocr in ocr_cells
|
|
||||||
]
|
|
||||||
return cells
|
return cells
|
||||||
|
|
||||||
## Remove OCR cells which overlap with programmatic cells.
|
## Remove OCR cells which overlap with programmatic cells.
|
||||||
@ -156,7 +156,7 @@ class BaseOcrModel(BasePageModel):
|
|||||||
|
|
||||||
# Draw OCR and programmatic cells
|
# Draw OCR and programmatic cells
|
||||||
for tc in page.cells:
|
for tc in page.cells:
|
||||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
|
||||||
y0 *= scale_x
|
y0 *= scale_x
|
||||||
y1 *= scale_y
|
y1 *= scale_y
|
||||||
x0 *= scale_x
|
x0 *= scale_x
|
||||||
@ -165,9 +165,10 @@ class BaseOcrModel(BasePageModel):
|
|||||||
if y1 <= y0:
|
if y1 <= y0:
|
||||||
y1, y0 = y0, y1
|
y1, y0 = y0, y1
|
||||||
|
|
||||||
color = "gray"
|
|
||||||
if isinstance(tc, OcrCell):
|
|
||||||
color = "magenta"
|
color = "magenta"
|
||||||
|
if isinstance(tc, PdfTextCell):
|
||||||
|
color = "gray"
|
||||||
|
|
||||||
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
||||||
|
|
||||||
if show:
|
if show:
|
||||||
|
@ -6,8 +6,9 @@ from typing import Iterable, List, Optional
|
|||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
@ -148,11 +149,13 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
del im
|
del im
|
||||||
|
|
||||||
cells = [
|
cells = [
|
||||||
OcrCell(
|
TextCell(
|
||||||
id=ix,
|
index=ix,
|
||||||
text=line[1],
|
text=line[1],
|
||||||
|
orig=line[1],
|
||||||
confidence=line[2],
|
confidence=line[2],
|
||||||
bbox=BoundingBox.from_tuple(
|
rect=BoundingRectangle.from_bounding_box(
|
||||||
|
BoundingBox.from_tuple(
|
||||||
coord=(
|
coord=(
|
||||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||||
@ -160,6 +163,7 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||||
),
|
),
|
||||||
origin=CoordOrigin.TOPLEFT,
|
origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
for ix, line in enumerate(result)
|
for ix, line in enumerate(result)
|
||||||
|
@ -3,8 +3,9 @@ import tempfile
|
|||||||
from typing import Iterable, Optional, Tuple
|
from typing import Iterable, Optional, Tuple
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import OcrMacOptions
|
from docling.datamodel.pipeline_options import OcrMacOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -94,13 +95,16 @@ class OcrMacModel(BaseOcrModel):
|
|||||||
bottom = y2 / self.scale
|
bottom = y2 / self.scale
|
||||||
|
|
||||||
cells.append(
|
cells.append(
|
||||||
OcrCell(
|
TextCell(
|
||||||
id=ix,
|
index=ix,
|
||||||
text=text,
|
text=text,
|
||||||
|
orig=text,
|
||||||
confidence=confidence,
|
confidence=confidence,
|
||||||
bbox=BoundingBox.from_tuple(
|
rect=BoundingRectangle.from_bounding_box(
|
||||||
|
BoundingBox.from_tuple(
|
||||||
coord=(left, top, right, bottom),
|
coord=(left, top, right, bottom),
|
||||||
origin=CoordOrigin.TOPLEFT,
|
origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -54,6 +54,7 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
|
|
||||||
page.cells = list(page._backend.get_text_cells())
|
page.cells = list(page._backend.get_text_cells())
|
||||||
|
page.parsed_page = page._backend.get_segmented_page()
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_text_boxes(image, cells, show: bool = False):
|
def draw_text_boxes(image, cells, show: bool = False):
|
||||||
|
@ -3,8 +3,9 @@ from typing import Iterable
|
|||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
@ -100,18 +101,25 @@ class RapidOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
if result is not None:
|
if result is not None:
|
||||||
cells = [
|
cells = [
|
||||||
OcrCell(
|
TextCell(
|
||||||
id=ix,
|
index=ix,
|
||||||
text=line[1],
|
text=line[1],
|
||||||
|
orig=line[1],
|
||||||
confidence=line[2],
|
confidence=line[2],
|
||||||
bbox=BoundingBox.from_tuple(
|
rect=BoundingRectangle.from_bounding_box(
|
||||||
|
BoundingBox.from_tuple(
|
||||||
coord=(
|
coord=(
|
||||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
(line[0][0][0] / self.scale)
|
||||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
+ ocr_rect.l,
|
||||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
(line[0][0][1] / self.scale)
|
||||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
+ ocr_rect.t,
|
||||||
|
(line[0][2][0] / self.scale)
|
||||||
|
+ ocr_rect.l,
|
||||||
|
(line[0][2][1] / self.scale)
|
||||||
|
+ ocr_rect.t,
|
||||||
),
|
),
|
||||||
origin=CoordOrigin.TOPLEFT,
|
origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
for ix, line in enumerate(result)
|
for ix, line in enumerate(result)
|
||||||
|
@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union
|
|||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle
|
||||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
|
||||||
@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
|
|||||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||||
|
|
||||||
for cell in table_element.cluster.cells:
|
for cell in table_element.cluster.cells:
|
||||||
x0, y0, x1, y1 = cell.bbox.as_tuple()
|
x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
|
||||||
x0 *= scale_x
|
x0 *= scale_x
|
||||||
x1 *= scale_x
|
x1 *= scale_x
|
||||||
y0 *= scale_x
|
y0 *= scale_x
|
||||||
@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
|
|||||||
# Only allow non empty stings (spaces) into the cells of a table
|
# Only allow non empty stings (spaces) into the cells of a table
|
||||||
if len(c.text.strip()) > 0:
|
if len(c.text.strip()) > 0:
|
||||||
new_cell = copy.deepcopy(c)
|
new_cell = copy.deepcopy(c)
|
||||||
new_cell.bbox = new_cell.bbox.scaled(
|
new_cell.rect = BoundingRectangle.from_bounding_box(
|
||||||
|
new_cell.rect.to_bounding_box().scaled(
|
||||||
scale=self.scale
|
scale=self.scale
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
|
||||||
tokens.append(new_cell.model_dump())
|
tokens.append(
|
||||||
|
{
|
||||||
|
"id": new_cell.index,
|
||||||
|
"text": new_cell.text,
|
||||||
|
"bbox": new_cell.rect.to_bounding_box().model_dump(),
|
||||||
|
}
|
||||||
|
)
|
||||||
page_input["tokens"] = tokens
|
page_input["tokens"] = tokens
|
||||||
|
|
||||||
tf_output = self.tf_predictor.multi_table_predict(
|
tf_output = self.tf_predictor.multi_table_predict(
|
||||||
|
@ -8,8 +8,9 @@ from typing import Iterable, List, Optional, Tuple
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -228,11 +229,13 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
t = b + h
|
t = b + h
|
||||||
r = l + w
|
r = l + w
|
||||||
|
|
||||||
cell = OcrCell(
|
cell = TextCell(
|
||||||
id=ix,
|
index=ix,
|
||||||
text=text,
|
text=text,
|
||||||
|
orig=text,
|
||||||
confidence=conf / 100.0,
|
confidence=conf / 100.0,
|
||||||
bbox=BoundingBox.from_tuple(
|
rect=BoundingRectangle.from_bounding_box(
|
||||||
|
BoundingBox.from_tuple(
|
||||||
coord=(
|
coord=(
|
||||||
(l / self.scale) + ocr_rect.l,
|
(l / self.scale) + ocr_rect.l,
|
||||||
(b / self.scale) + ocr_rect.t,
|
(b / self.scale) + ocr_rect.t,
|
||||||
@ -240,6 +243,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
(t / self.scale) + ocr_rect.t,
|
(t / self.scale) + ocr_rect.t,
|
||||||
),
|
),
|
||||||
origin=CoordOrigin.TOPLEFT,
|
origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
all_ocr_cells.append(cell)
|
all_ocr_cells.append(cell)
|
||||||
|
@ -2,8 +2,9 @@ import logging
|
|||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -173,14 +174,17 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
top = (box["y"] + box["h"]) / self.scale
|
top = (box["y"] + box["h"]) / self.scale
|
||||||
|
|
||||||
cells.append(
|
cells.append(
|
||||||
OcrCell(
|
TextCell(
|
||||||
id=ix,
|
index=ix,
|
||||||
text=text,
|
text=text,
|
||||||
|
orig=text,
|
||||||
confidence=confidence,
|
confidence=confidence,
|
||||||
bbox=BoundingBox.from_tuple(
|
rect=BoundingRectangle.from_bounding_box(
|
||||||
|
BoundingBox.from_tuple(
|
||||||
coord=(left, top, right, bottom),
|
coord=(left, top, right, bottom),
|
||||||
origin=CoordOrigin.TOPLEFT,
|
origin=CoordOrigin.TOPLEFT,
|
||||||
),
|
),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -2,9 +2,9 @@ import logging
|
|||||||
from typing import Any, Dict, Iterable, List, Tuple, Union
|
from typing import Any, Dict, Iterable, List, Tuple, Union
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import TextCell
|
||||||
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell
|
|
||||||
from docling.datamodel.document import ConversionResult, Page
|
from docling.datamodel.document import ConversionResult, Page
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -86,11 +86,13 @@ def generate_multimodal_pages(
|
|||||||
if page.size is None:
|
if page.size is None:
|
||||||
return cells
|
return cells
|
||||||
for cell in page.cells:
|
for cell in page.cells:
|
||||||
new_bbox = cell.bbox.to_top_left_origin(
|
new_bbox = (
|
||||||
page_height=page.size.height
|
cell.rect.to_bounding_box()
|
||||||
).normalized(page_size=page.size)
|
.to_top_left_origin(page_height=page.size.height)
|
||||||
is_ocr = isinstance(cell, OcrCell)
|
.normalized(page_size=page.size)
|
||||||
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
|
)
|
||||||
|
is_ocr = isinstance(cell, TextCell)
|
||||||
|
ocr_confidence = cell.confidence if isinstance(cell, TextCell) else 1.0
|
||||||
cells.append(
|
cells.append(
|
||||||
{
|
{
|
||||||
"text": cell.text,
|
"text": cell.text,
|
||||||
|
@ -5,9 +5,10 @@ from collections import defaultdict
|
|||||||
from typing import Dict, List, Set, Tuple
|
from typing import Dict, List, Set, Tuple
|
||||||
|
|
||||||
from docling_core.types.doc import DocItemLabel, Size
|
from docling_core.types.doc import DocItemLabel, Size
|
||||||
|
from docling_core.types.doc.page import TextCell
|
||||||
from rtree import index
|
from rtree import index
|
||||||
|
|
||||||
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
|
from docling.datamodel.base_models import BoundingBox, Cluster
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -198,7 +199,7 @@ class LayoutPostprocessor:
|
|||||||
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
|
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
|
||||||
"""Initialize processor with cells and clusters."""
|
"""Initialize processor with cells and clusters."""
|
||||||
"""Initialize processor with cells and spatial indices."""
|
"""Initialize processor with cells and spatial indices."""
|
||||||
self.cells = cells
|
self.cells = cells
|
||||||
@ -218,7 +219,7 @@ class LayoutPostprocessor:
|
|||||||
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
||||||
)
|
)
|
||||||
|
|
||||||
def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
|
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
||||||
"""Main processing pipeline."""
|
"""Main processing pipeline."""
|
||||||
self.regular_clusters = self._process_regular_clusters()
|
self.regular_clusters = self._process_regular_clusters()
|
||||||
self.special_clusters = self._process_special_clusters()
|
self.special_clusters = self._process_special_clusters()
|
||||||
@ -272,14 +273,14 @@ class LayoutPostprocessor:
|
|||||||
orphan_clusters = []
|
orphan_clusters = []
|
||||||
for i, cell in enumerate(unassigned):
|
for i, cell in enumerate(unassigned):
|
||||||
conf = 1.0
|
conf = 1.0
|
||||||
if isinstance(cell, OcrCell):
|
if isinstance(cell, TextCell):
|
||||||
conf = cell.confidence
|
conf = cell.confidence
|
||||||
|
|
||||||
orphan_clusters.append(
|
orphan_clusters.append(
|
||||||
Cluster(
|
Cluster(
|
||||||
id=next_id + i,
|
id=next_id + i,
|
||||||
label=DocItemLabel.TEXT,
|
label=DocItemLabel.TEXT,
|
||||||
bbox=cell.bbox,
|
bbox=cell.to_bounding_box(),
|
||||||
confidence=conf,
|
confidence=conf,
|
||||||
cells=[cell],
|
cells=[cell],
|
||||||
)
|
)
|
||||||
@ -557,13 +558,13 @@ class LayoutPostprocessor:
|
|||||||
|
|
||||||
return current_best if current_best else clusters[0]
|
return current_best if current_best else clusters[0]
|
||||||
|
|
||||||
def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
|
def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||||
"""Ensure each cell appears only once, maintaining order of first appearance."""
|
"""Ensure each cell appears only once, maintaining order of first appearance."""
|
||||||
seen_ids = set()
|
seen_ids = set()
|
||||||
unique_cells = []
|
unique_cells = []
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
if cell.id not in seen_ids:
|
if cell.index not in seen_ids:
|
||||||
seen_ids.add(cell.id)
|
seen_ids.add(cell.index)
|
||||||
unique_cells.append(cell)
|
unique_cells.append(cell)
|
||||||
return unique_cells
|
return unique_cells
|
||||||
|
|
||||||
@ -582,11 +583,13 @@ class LayoutPostprocessor:
|
|||||||
best_cluster = None
|
best_cluster = None
|
||||||
|
|
||||||
for cluster in clusters:
|
for cluster in clusters:
|
||||||
if cell.bbox.area() <= 0:
|
if cell.rect.to_bounding_box().area() <= 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
overlap = cell.bbox.intersection_area_with(cluster.bbox)
|
overlap = cell.rect.to_bounding_box().intersection_area_with(
|
||||||
overlap_ratio = overlap / cell.bbox.area()
|
cluster.bbox
|
||||||
|
)
|
||||||
|
overlap_ratio = overlap / cell.rect.to_bounding_box().area()
|
||||||
|
|
||||||
if overlap_ratio > best_overlap:
|
if overlap_ratio > best_overlap:
|
||||||
best_overlap = overlap_ratio
|
best_overlap = overlap_ratio
|
||||||
@ -601,11 +604,13 @@ class LayoutPostprocessor:
|
|||||||
|
|
||||||
return clusters
|
return clusters
|
||||||
|
|
||||||
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
|
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
|
||||||
"""Find cells not assigned to any cluster."""
|
"""Find cells not assigned to any cluster."""
|
||||||
assigned = {cell.id for cluster in clusters for cell in cluster.cells}
|
assigned = {cell.index for cluster in clusters for cell in cluster.cells}
|
||||||
return [
|
return [
|
||||||
cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
|
cell
|
||||||
|
for cell in self.cells
|
||||||
|
if cell.index not in assigned and cell.text.strip()
|
||||||
]
|
]
|
||||||
|
|
||||||
def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
|
def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
|
||||||
@ -615,10 +620,10 @@ class LayoutPostprocessor:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
cells_bbox = BoundingBox(
|
cells_bbox = BoundingBox(
|
||||||
l=min(cell.bbox.l for cell in cluster.cells),
|
l=min(cell.rect.to_bounding_box().l for cell in cluster.cells),
|
||||||
t=min(cell.bbox.t for cell in cluster.cells),
|
t=min(cell.rect.to_bounding_box().t for cell in cluster.cells),
|
||||||
r=max(cell.bbox.r for cell in cluster.cells),
|
r=max(cell.rect.to_bounding_box().r for cell in cluster.cells),
|
||||||
b=max(cell.bbox.b for cell in cluster.cells),
|
b=max(cell.rect.to_bounding_box().b for cell in cluster.cells),
|
||||||
)
|
)
|
||||||
|
|
||||||
if cluster.label == DocItemLabel.TABLE:
|
if cluster.label == DocItemLabel.TABLE:
|
||||||
@ -634,9 +639,9 @@ class LayoutPostprocessor:
|
|||||||
|
|
||||||
return clusters
|
return clusters
|
||||||
|
|
||||||
def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
|
def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||||
"""Sort cells in native reading order."""
|
"""Sort cells in native reading order."""
|
||||||
return sorted(cells, key=lambda c: (c.id))
|
return sorted(cells, key=lambda c: (c.index))
|
||||||
|
|
||||||
def _sort_clusters(
|
def _sort_clusters(
|
||||||
self, clusters: List[Cluster], mode: str = "id"
|
self, clusters: List[Cluster], mode: str = "id"
|
||||||
@ -647,7 +652,7 @@ class LayoutPostprocessor:
|
|||||||
clusters,
|
clusters,
|
||||||
key=lambda cluster: (
|
key=lambda cluster: (
|
||||||
(
|
(
|
||||||
min(cell.id for cell in cluster.cells)
|
min(cell.index for cell in cluster.cells)
|
||||||
if cluster.cells
|
if cluster.cells
|
||||||
else sys.maxsize
|
else sys.maxsize
|
||||||
),
|
),
|
||||||
|
@ -25,7 +25,7 @@ def draw_clusters(
|
|||||||
# Draw cells first (underneath)
|
# Draw cells first (underneath)
|
||||||
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
||||||
for tc in c.cells:
|
for tc in c.cells:
|
||||||
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
|
cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
|
||||||
cx0 *= scale_x
|
cx0 *= scale_x
|
||||||
cx1 *= scale_x
|
cx1 *= scale_x
|
||||||
cy0 *= scale_x
|
cy0 *= scale_x
|
||||||
|
2
poetry.lock
generated
2
poetry.lock
generated
@ -898,7 +898,7 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
|
|||||||
type = "git"
|
type = "git"
|
||||||
url = "https://github.com/DS4SD/docling-core"
|
url = "https://github.com/DS4SD/docling-core"
|
||||||
reference = "cau/docling-parse-types"
|
reference = "cau/docling-parse-types"
|
||||||
resolved_reference = "31db5b0225a4baa8be5f26cc50050cf4bc845204"
|
resolved_reference = "5f404c0270408ba794c18f8d6923cfa9f2980d73"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling-ibm-models"
|
name = "docling-ibm-models"
|
||||||
|
@ -1,77 +0,0 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from docling_core.types.doc import BoundingBox
|
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import (
|
|
||||||
DoclingParseDocumentBackend,
|
|
||||||
DoclingParsePageBackend,
|
|
||||||
)
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
|
||||||
from docling.datamodel.document import InputDocument
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def test_doc_path():
|
|
||||||
return Path("./tests/data/pdf/2206.01062.pdf")
|
|
||||||
|
|
||||||
|
|
||||||
def _get_backend(pdf_doc):
|
|
||||||
in_doc = InputDocument(
|
|
||||||
path_or_stream=pdf_doc,
|
|
||||||
format=InputFormat.PDF,
|
|
||||||
backend=DoclingParseDocumentBackend,
|
|
||||||
)
|
|
||||||
|
|
||||||
doc_backend = in_doc._backend
|
|
||||||
return doc_backend
|
|
||||||
|
|
||||||
|
|
||||||
def test_text_cell_counts():
|
|
||||||
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
|
|
||||||
|
|
||||||
doc_backend = _get_backend(pdf_doc)
|
|
||||||
|
|
||||||
for page_index in range(0, doc_backend.page_count()):
|
|
||||||
last_cell_count = None
|
|
||||||
for i in range(10):
|
|
||||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
|
||||||
cells = list(page_backend.get_text_cells())
|
|
||||||
|
|
||||||
if last_cell_count is None:
|
|
||||||
last_cell_count = len(cells)
|
|
||||||
|
|
||||||
if len(cells) != last_cell_count:
|
|
||||||
assert (
|
|
||||||
False
|
|
||||||
), "Loading page multiple times yielded non-identical text cell counts"
|
|
||||||
last_cell_count = len(cells)
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_text_from_rect(test_doc_path):
|
|
||||||
doc_backend = _get_backend(test_doc_path)
|
|
||||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
|
||||||
|
|
||||||
# Get the title text of the DocLayNet paper
|
|
||||||
textpiece = page_backend.get_text_in_rect(
|
|
||||||
bbox=BoundingBox(l=102, t=77, r=511, b=124)
|
|
||||||
)
|
|
||||||
ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
|
|
||||||
|
|
||||||
assert textpiece.strip() == ref
|
|
||||||
|
|
||||||
|
|
||||||
def test_crop_page_image(test_doc_path):
|
|
||||||
doc_backend = _get_backend(test_doc_path)
|
|
||||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
|
||||||
|
|
||||||
# Crop out "Figure 1" from the DocLayNet paper
|
|
||||||
im = page_backend.get_page_image(
|
|
||||||
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
|
|
||||||
)
|
|
||||||
# im.show()
|
|
||||||
|
|
||||||
|
|
||||||
def test_num_pages(test_doc_path):
|
|
||||||
doc_backend = _get_backend(test_doc_path)
|
|
||||||
doc_backend.page_count() == 9
|
|
@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def test_doc_path():
|
def test_doc_path():
|
||||||
return Path("./tests/data/2206.01062.pdf")
|
return Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
|
|
||||||
|
|
||||||
def _get_backend(pdf_doc):
|
def _get_backend(pdf_doc):
|
||||||
|
@ -3,7 +3,6 @@ from pathlib import Path
|
|||||||
from docling_core.types.doc import CodeItem, TextItem
|
from docling_core.types.doc import CodeItem, TextItem
|
||||||
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
|
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
|
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
|
||||||
@ -33,7 +33,7 @@ def get_converter():
|
|||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
|
pipeline_options=pipeline_options, backend=DoclingParseV3DocumentBackend
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
@ -2,7 +2,7 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
@ -44,7 +44,7 @@ def get_converter(ocr_options: OcrOptions):
|
|||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
backend=DoclingParseDocumentBackend,
|
backend=DoclingParseV3DocumentBackend,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
@ -3,7 +3,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
@ -30,7 +30,7 @@ def converter():
|
|||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
|
pipeline_options=pipeline_options, backend=DoclingParseV3DocumentBackend
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
@ -3,7 +3,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
@ -33,7 +33,7 @@ def get_converters_with_table_options():
|
|||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
backend=DoclingParseDocumentBackend,
|
backend=DoclingParseV3DocumentBackend,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
@ -79,8 +79,8 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
|
|||||||
pred_text = cell_pred_item.text
|
pred_text = cell_pred_item.text
|
||||||
assert true_text == pred_text, f"{true_text}!={pred_text}"
|
assert true_text == pred_text, f"{true_text}!={pred_text}"
|
||||||
|
|
||||||
true_bbox = cell_true_item.bbox.as_tuple()
|
true_bbox = cell_true_item.rect.to_bounding_box().as_tuple()
|
||||||
pred_bbox = cell_pred_item.bbox.as_tuple()
|
pred_bbox = cell_pred_item.rect.to_bounding_box().as_tuple()
|
||||||
assert (
|
assert (
|
||||||
true_bbox == pred_bbox
|
true_bbox == pred_bbox
|
||||||
), f"bbox is not the same: {true_bbox} != {pred_bbox}"
|
), f"bbox is not the same: {true_bbox} != {pred_bbox}"
|
||||||
|
Loading…
Reference in New Issue
Block a user