feat: Add adaptive OCR, factor out treatment of OCR areas and cell filtering (#38)

* Introduce adaptive OCR

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Factor out BaseOcrModel, add docling-parse backend tests, fixes

* Make easyocr default dep

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-08-20 15:28:03 +02:00
committed by GitHub
parent 47b8ad917e
commit e94d317c02
13 changed files with 285 additions and 83 deletions

View File

@@ -18,6 +18,10 @@ class PdfPageBackend(ABC):
def get_text_cells(self) -> Iterable["Cell"]:
pass
@abstractmethod
def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
    """Return the bounding boxes of the bitmap regions found on this page.

    This is an abstract hook: the base implementation does nothing and
    returns ``None``; concrete page backends are expected to override it.

    Args:
        scale: Factor that implementations apply to the reported boxes.

    Returns:
        An iterable of ``BoundingBox`` objects, one per bitmap region.
    """
@abstractmethod
def get_page_image(
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None

View File

@@ -3,7 +3,7 @@ import random
import time
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from typing import Iterable, Optional, Union
import pypdfium2 as pdfium
from docling_parse.docling_parse import pdf_parser
@@ -43,7 +43,7 @@ class DoclingParsePageBackend(PdfPageBackend):
r=x1 * scale * page_size.width / parser_width,
t=y1 * scale * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(page_size.height * scale)
).to_top_left_origin(page_height=page_size.height * scale)
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
@@ -66,6 +66,12 @@ class DoclingParsePageBackend(PdfPageBackend):
for i in range(len(self._dpage["cells"])):
rect = self._dpage["cells"][i]["box"]["device"]
x0, y0, x1, y1 = rect
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
cells.append(
Cell(
@@ -108,6 +114,20 @@ class DoclingParsePageBackend(PdfPageBackend):
return cells
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
    """Yield the bounding boxes of the bitmap images recorded for this page.

    Each entry of the parsed page's ``images`` list contributes its ``box``
    tuple, which is interpreted with a bottom-left origin, converted to a
    top-left origin using the page height, and finally multiplied by
    ``scale``.

    Args:
        scale: Factor applied to every yielded bounding box.

    Yields:
        One top-left-origin ``BoundingBox`` per bitmap whose area, measured
        before scaling, is strictly greater than ``32 * 32``.
    """
    # Minimum (unscaled) area a bitmap must exceed to be reported;
    # presumably filters out tiny decorative images -- confirm with callers.
    AREA_THRESHOLD = 32 * 32

    # The page height does not change while iterating, so read it once.
    page_height = self.get_size().height

    for bitmap in self._dpage["images"]:
        cropbox = BoundingBox.from_tuple(
            bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
        ).to_top_left_origin(page_height=page_height)

        # The threshold is checked on the unscaled box; scaling happens last.
        if cropbox.area() > AREA_THRESHOLD:
            yield cropbox.scaled(scale=scale)
def get_page_image(
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
@@ -173,7 +193,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
def page_count(self) -> int:
    """Return the number of entries under ``"pages"`` in the parsed document."""
    parsed_pages = self._parser_doc["pages"]
    return len(parsed_pages)
def load_page(self, page_no: int) -> PdfPage:
def load_page(self, page_no: int) -> DoclingParsePageBackend:
return DoclingParsePageBackend(
self._pdoc[page_no], self._parser_doc["pages"][page_no]
)

View File

@@ -4,6 +4,7 @@ from pathlib import Path
from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
@@ -17,6 +18,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
self._ppage = page_obj
self.text_page = None
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
    """Yield the bounding boxes of the image objects on this page.

    The page is queried for objects of type ``FPDF_PAGEOBJ_IMAGE``. Each
    object's position tuple is interpreted with a bottom-left origin,
    converted to a top-left origin using the page height, and finally
    multiplied by ``scale``.

    Args:
        scale: Factor applied to every yielded bounding box.

    Yields:
        One top-left-origin ``BoundingBox`` per image object whose area,
        measured before scaling, is strictly greater than ``32 * 32``.
    """
    # Minimum (unscaled) area an image must exceed to be reported;
    # presumably filters out tiny decorative images -- confirm with callers.
    AREA_THRESHOLD = 32 * 32

    # The page height does not change while iterating, so read it once.
    page_height = self.get_size().height

    for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
        cropbox = BoundingBox.from_tuple(
            obj.get_pos(), origin=CoordOrigin.BOTTOMLEFT
        ).to_top_left_origin(page_height=page_height)

        # The threshold is checked on the unscaled box; scaling happens last.
        if cropbox.area() > AREA_THRESHOLD:
            yield cropbox.scaled(scale=scale)
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
@@ -208,7 +222,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
def page_count(self) -> int:
    """Return the length of the underlying ``_pdoc`` document object."""
    document = self._pdoc
    return len(document)
def load_page(self, page_no: int) -> PdfPage:
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
return PyPdfiumPageBackend(self._pdoc[page_no])
def is_valid(self) -> bool: