Mirror of https://github.com/DS4SD/docling.git (synced 2025-12-18 09:31:02 +00:00)
feat: Add pipeline timings and toggle visualization, establish debug settings (#183)
* Add settings to turn visualization on or off
* Add profiling code to all models
* Refactor and fix profiling code
* Visualization code outputs PNG to debug dir
* Fixes for time logging
* Optimize imports
* Update lockfile
* Add start_timestamps to ProfilingItem

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
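The diff below applies two recurring patterns: every model step is wrapped in a TimeRecorder so its runtime is recorded on the ConversionResult, either per page or once per document, and the interactive image.show() debug calls are replaced by toggles under settings.debug that write PNGs to a debug directory instead. A minimal sketch of the timing pattern follows; the TimeRecorder and ProfilingScope calls are taken from the hunks below, while the wrapper function names are illustrative only.

from docling.utils.profiling import ProfilingScope, TimeRecorder


def run_page_step(conv_res, page_batch):
    # Page-scoped timing: each page's work is recorded under a step key ("ocr" here,
    # matching one of the keys used in the diff).
    for page in page_batch:
        with TimeRecorder(conv_res, "ocr"):
            ...  # per-page work goes here
        yield page


def run_document_step(conv_res):
    # Document-scoped timing, as used for the GLM step in this commit.
    with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
        ...  # whole-document work goes here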
@@ -4,11 +4,14 @@ from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem

from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult


class BasePageModel(ABC):
@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
pass
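For reference, a hypothetical no-op model conforming to the updated interface above: __call__ now also receives the ConversionResult, which is what lets each model record timings against it. The imports and signature mirror the hunk above; only the class name and the "noop" step key are invented for illustration.

from typing import Iterable

from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder


class NoOpPageModel(BasePageModel):
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            with TimeRecorder(conv_res, "noop"):  # hypothetical step name
                pass  # a real model would enrich the page here
            yield page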
@@ -1,6 +1,7 @@
import copy
import logging
from abc import abstractmethod
from pathlib import Path
from typing import Iterable, List

import numpy as np

@@ -10,12 +11,15 @@ from rtree import index
from scipy.ndimage import find_objects, label

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel

_log = logging.getLogger(__name__)


class BaseOcrModel:
class BaseOcrModel(BasePageModel):
def __init__(self, enabled: bool, options: OcrOptions):
self.enabled = enabled
self.options = options

@@ -113,7 +117,7 @@ class BaseOcrModel:
]
return filtered_ocr_cells

def draw_ocr_rects_and_cells(self, page, ocr_rects):
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image, "RGBA")

@@ -130,8 +134,21 @@ class BaseOcrModel:
if isinstance(tc, OcrCell):
color = "magenta"
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
image.show()

if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)

out_file = out_path / f"ocr_page_{page.page_no:05}.png"
image.save(str(out_file), format="png")

@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
pass
@@ -1,5 +1,6 @@
import copy
import random
from pathlib import Path
from typing import List, Union

from deepsearch_glm.nlp_utils import init_nlp_model

@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict

from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash


@@ -226,23 +229,24 @@ class GlmModel:
return ds_doc

def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)

glm_doc = self.model.apply_on_doc(ds_doc_dict)
glm_doc = self.model.apply_on_doc(ds_doc_dict)

docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental

# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no):
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
clusters_to_draw = []
image = copy.deepcopy(conv_res.pages[page_no].image)
for ix, elem in enumerate(ds_document.main_text):
if isinstance(elem, BaseText):
prov = elem.prov[0]
prov = elem.prov[0] # type: ignore
elif isinstance(elem, Ref):
_, arr, index = elem.ref.split("/")
index = int(index)
index = int(index) # type: ignore
if arr == "tables":
prov = ds_document.tables[index].prov[0]
elif arr == "figures":

@@ -256,7 +260,7 @@ class GlmModel:
id=ix,
label=elem.name,
bbox=BoundingBox.from_tuple(
coord=prov.bbox,
coord=prov.bbox, # type: ignore
origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(conv_res.pages[page_no].size.height),
)

@@ -276,9 +280,21 @@ class GlmModel:
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()

# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)

out_file = out_path / f"doc_page_{page_no:05}.png"
image.save(str(out_file), format="png")

# for item in ds_doc.page_dimensions:
# page_no = item.page
# draw_clusters_and_cells(ds_doc, page_no)

return docling_doc
@@ -5,8 +5,11 @@ import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -33,58 +36,65 @@ class EasyOcrModel(BaseOcrModel):
|
||||
download_enabled=self.options.download_enabled,
|
||||
)
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
with TimeRecorder(conv_res, "ocr"):
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
im = numpy.array(high_res_image)
|
||||
result = self.reader.readtext(im)
|
||||
|
||||
del high_res_image
|
||||
del im
|
||||
|
||||
cells = [
|
||||
OcrCell(
|
||||
id=ix,
|
||||
text=line[1],
|
||||
confidence=line[2],
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
for ix, line in enumerate(result)
|
||||
]
|
||||
all_ocr_cells.extend(cells)
|
||||
im = numpy.array(high_res_image)
|
||||
result = self.reader.readtext(im)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
||||
del high_res_image
|
||||
del im
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
cells = [
|
||||
OcrCell(
|
||||
id=ix,
|
||||
text=line[1],
|
||||
confidence=line[2],
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
)
|
||||
for ix, line in enumerate(result)
|
||||
]
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
# DEBUG code:
|
||||
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
||||
if settings.debug.visualize_ocr:
|
||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||
|
||||
yield page
|
||||
|
||||
@@ -16,8 +16,11 @@ from docling.datamodel.base_models import (
|
||||
LayoutPrediction,
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils import layout_utils as lu
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -271,74 +274,97 @@ class LayoutModel(BasePageModel):
|
||||
|
||||
return clusters_out_new, cells_out_new
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
assert page.size is not None
|
||||
with TimeRecorder(conv_res, "layout"):
|
||||
assert page.size is not None
|
||||
|
||||
clusters = []
|
||||
for ix, pred_item in enumerate(
|
||||
self.layout_predictor.predict(page.get_image(scale=1.0))
|
||||
):
|
||||
label = DocItemLabel(
|
||||
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
|
||||
) # Temporary, until docling-ibm-model uses docling-core types
|
||||
cluster = Cluster(
|
||||
id=ix,
|
||||
label=label,
|
||||
confidence=pred_item["confidence"],
|
||||
bbox=BoundingBox.model_validate(pred_item),
|
||||
cells=[],
|
||||
)
|
||||
clusters.append(cluster)
|
||||
|
||||
# Map cells to clusters
|
||||
# TODO: Remove, postprocess should take care of it anyway.
|
||||
for cell in page.cells:
|
||||
for cluster in clusters:
|
||||
if not cell.bbox.area() > 0:
|
||||
overlap_frac = 0.0
|
||||
else:
|
||||
overlap_frac = (
|
||||
cell.bbox.intersection_area_with(cluster.bbox)
|
||||
/ cell.bbox.area()
|
||||
)
|
||||
|
||||
if overlap_frac > 0.5:
|
||||
cluster.cells.append(cell)
|
||||
|
||||
# Pre-sort clusters
|
||||
# clusters = self.sort_clusters_by_cell_order(clusters)
|
||||
|
||||
# DEBUG code:
|
||||
def draw_clusters_and_cells():
|
||||
image = copy.deepcopy(page.image)
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in clusters:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
||||
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
clusters = []
|
||||
for ix, pred_item in enumerate(
|
||||
self.layout_predictor.predict(page.get_image(scale=1.0))
|
||||
):
|
||||
label = DocItemLabel(
|
||||
pred_item["label"]
|
||||
.lower()
|
||||
.replace(" ", "_")
|
||||
.replace("-", "_")
|
||||
) # Temporary, until docling-ibm-model uses docling-core types
|
||||
cluster = Cluster(
|
||||
id=ix,
|
||||
label=label,
|
||||
confidence=pred_item["confidence"],
|
||||
bbox=BoundingBox.model_validate(pred_item),
|
||||
cells=[],
|
||||
)
|
||||
for tc in c.cells: # [:1]:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
image.show()
|
||||
clusters.append(cluster)
|
||||
|
||||
# draw_clusters_and_cells()
|
||||
# Map cells to clusters
|
||||
# TODO: Remove, postprocess should take care of it anyway.
|
||||
for cell in page.cells:
|
||||
for cluster in clusters:
|
||||
if not cell.bbox.area() > 0:
|
||||
overlap_frac = 0.0
|
||||
else:
|
||||
overlap_frac = (
|
||||
cell.bbox.intersection_area_with(cluster.bbox)
|
||||
/ cell.bbox.area()
|
||||
)
|
||||
|
||||
clusters, page.cells = self.postprocess(
|
||||
clusters, page.cells, page.size.height
|
||||
)
|
||||
if overlap_frac > 0.5:
|
||||
cluster.cells.append(cell)
|
||||
|
||||
# draw_clusters_and_cells()
|
||||
# Pre-sort clusters
|
||||
# clusters = self.sort_clusters_by_cell_order(clusters)
|
||||
|
||||
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
||||
# DEBUG code:
|
||||
def draw_clusters_and_cells(show: bool = False):
|
||||
image = copy.deepcopy(page.image)
|
||||
if image is not None:
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in clusters:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
||||
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
)
|
||||
for tc in c.cells: # [:1]:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
draw.rectangle(
|
||||
[(x0, y0), (x1, y1)], outline=cell_color
|
||||
)
|
||||
if show:
|
||||
image.show()
|
||||
else:
|
||||
out_path: Path = (
|
||||
Path(settings.debug.debug_output_path)
|
||||
/ f"debug_{conv_res.input.file.stem}"
|
||||
)
|
||||
out_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
out_file = (
|
||||
out_path / f"layout_page_{page.page_no:05}.png"
|
||||
)
|
||||
image.save(str(out_file), format="png")
|
||||
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
clusters, page.cells = self.postprocess(
|
||||
clusters, page.cells, page.size.height
|
||||
)
|
||||
|
||||
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
||||
|
||||
if settings.debug.visualize_layout:
|
||||
draw_clusters_and_cells()
|
||||
|
||||
yield page
|
||||
|
||||
@@ -12,8 +12,10 @@ from docling.datamodel.base_models import (
|
||||
Table,
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -51,122 +53,122 @@ class PageAssembleModel(BasePageModel):
|
||||
|
||||
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
assert page.predictions.layout is not None
|
||||
with TimeRecorder(conv_res, "page_assemble"):
|
||||
|
||||
# assembles some JSON output page by page.
|
||||
assert page.predictions.layout is not None
|
||||
|
||||
elements: List[PageElement] = []
|
||||
headers: List[PageElement] = []
|
||||
body: List[PageElement] = []
|
||||
# assembles some JSON output page by page.
|
||||
|
||||
for cluster in page.predictions.layout.clusters:
|
||||
# _log.info("Cluster label seen:", cluster.label)
|
||||
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
||||
elements: List[PageElement] = []
|
||||
headers: List[PageElement] = []
|
||||
body: List[PageElement] = []
|
||||
|
||||
textlines = [
|
||||
cell.text.replace("\x02", "-").strip()
|
||||
for cell in cluster.cells
|
||||
if len(cell.text.strip()) > 0
|
||||
]
|
||||
text = self.sanitize_text(textlines)
|
||||
text_el = TextElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
text=text,
|
||||
page_no=page.page_no,
|
||||
cluster=cluster,
|
||||
)
|
||||
elements.append(text_el)
|
||||
for cluster in page.predictions.layout.clusters:
|
||||
# _log.info("Cluster label seen:", cluster.label)
|
||||
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
||||
|
||||
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
||||
headers.append(text_el)
|
||||
else:
|
||||
body.append(text_el)
|
||||
elif cluster.label == LayoutModel.TABLE_LABEL:
|
||||
tbl = None
|
||||
if page.predictions.tablestructure:
|
||||
tbl = page.predictions.tablestructure.table_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
if (
|
||||
not tbl
|
||||
): # fallback: add table without structure, if it isn't present
|
||||
tbl = Table(
|
||||
textlines = [
|
||||
cell.text.replace("\x02", "-").strip()
|
||||
for cell in cluster.cells
|
||||
if len(cell.text.strip()) > 0
|
||||
]
|
||||
text = self.sanitize_text(textlines)
|
||||
text_el = TextElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
text="",
|
||||
otsl_seq=[],
|
||||
table_cells=[],
|
||||
cluster=cluster,
|
||||
page_no=page.page_no,
|
||||
)
|
||||
|
||||
elements.append(tbl)
|
||||
body.append(tbl)
|
||||
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
||||
fig = None
|
||||
if page.predictions.figures_classification:
|
||||
fig = (
|
||||
page.predictions.figures_classification.figure_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
)
|
||||
if (
|
||||
not fig
|
||||
): # fallback: add figure without classification, if it isn't present
|
||||
fig = FigureElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
text="",
|
||||
data=None,
|
||||
cluster=cluster,
|
||||
page_no=page.page_no,
|
||||
)
|
||||
elements.append(fig)
|
||||
body.append(fig)
|
||||
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
||||
equation = None
|
||||
if page.predictions.equations_prediction:
|
||||
equation = (
|
||||
page.predictions.equations_prediction.equation_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
)
|
||||
if (
|
||||
not equation
|
||||
): # fallback: add empty formula, if it isn't present
|
||||
text = self.sanitize_text(
|
||||
[
|
||||
cell.text.replace("\x02", "-").strip()
|
||||
for cell in cluster.cells
|
||||
if len(cell.text.strip()) > 0
|
||||
]
|
||||
)
|
||||
equation = TextElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
cluster=cluster,
|
||||
page_no=page.page_no,
|
||||
text=text,
|
||||
page_no=page.page_no,
|
||||
cluster=cluster,
|
||||
)
|
||||
elements.append(equation)
|
||||
body.append(equation)
|
||||
elements.append(text_el)
|
||||
|
||||
page.assembled = AssembledUnit(
|
||||
elements=elements, headers=headers, body=body
|
||||
)
|
||||
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
||||
headers.append(text_el)
|
||||
else:
|
||||
body.append(text_el)
|
||||
elif cluster.label == LayoutModel.TABLE_LABEL:
|
||||
tbl = None
|
||||
if page.predictions.tablestructure:
|
||||
tbl = page.predictions.tablestructure.table_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
if (
|
||||
not tbl
|
||||
): # fallback: add table without structure, if it isn't present
|
||||
tbl = Table(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
text="",
|
||||
otsl_seq=[],
|
||||
table_cells=[],
|
||||
cluster=cluster,
|
||||
page_no=page.page_no,
|
||||
)
|
||||
|
||||
# Remove page images (can be disabled)
|
||||
if not self.options.keep_images:
|
||||
page._image_cache = {}
|
||||
elements.append(tbl)
|
||||
body.append(tbl)
|
||||
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
||||
fig = None
|
||||
if page.predictions.figures_classification:
|
||||
fig = page.predictions.figures_classification.figure_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
if (
|
||||
not fig
|
||||
): # fallback: add figure without classification, if it isn't present
|
||||
fig = FigureElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
text="",
|
||||
data=None,
|
||||
cluster=cluster,
|
||||
page_no=page.page_no,
|
||||
)
|
||||
elements.append(fig)
|
||||
body.append(fig)
|
||||
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
||||
equation = None
|
||||
if page.predictions.equations_prediction:
|
||||
equation = page.predictions.equations_prediction.equation_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
if (
|
||||
not equation
|
||||
): # fallback: add empty formula, if it isn't present
|
||||
text = self.sanitize_text(
|
||||
[
|
||||
cell.text.replace("\x02", "-").strip()
|
||||
for cell in cluster.cells
|
||||
if len(cell.text.strip()) > 0
|
||||
]
|
||||
)
|
||||
equation = TextElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
cluster=cluster,
|
||||
page_no=page.page_no,
|
||||
text=text,
|
||||
)
|
||||
elements.append(equation)
|
||||
body.append(equation)
|
||||
|
||||
# Unload backend
|
||||
page._backend.unload()
|
||||
page.assembled = AssembledUnit(
|
||||
elements=elements, headers=headers, body=body
|
||||
)
|
||||
|
||||
# Remove page images (can be disabled)
|
||||
if not self.options.keep_images:
|
||||
page._image_cache = {}
|
||||
|
||||
# Unload backend
|
||||
page._backend.unload()
|
||||
|
||||
yield page
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional
|
||||
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
|
||||
@@ -15,14 +19,17 @@ class PagePreprocessingModel(BasePageModel):
|
||||
def __init__(self, options: PagePreprocessingOptions):
|
||||
self.options = options
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
page = self._populate_page_images(page)
|
||||
page = self._parse_page_cells(page)
|
||||
with TimeRecorder(conv_res, "page_parse"):
|
||||
page = self._populate_page_images(page)
|
||||
page = self._parse_page_cells(conv_res, page)
|
||||
yield page
|
||||
|
||||
# Generate the page image and store it in the page object
|
||||
@@ -43,19 +50,30 @@ class PagePreprocessingModel(BasePageModel):
|
||||
return page
|
||||
|
||||
# Extract and populate the page cells and store it in the page object
|
||||
def _parse_page_cells(self, page: Page) -> Page:
|
||||
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
assert page._backend is not None
|
||||
|
||||
page.cells = list(page._backend.get_text_cells())
|
||||
|
||||
# DEBUG code:
|
||||
def draw_text_boxes(image, cells):
|
||||
def draw_text_boxes(image, cells, show: bool = False):
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||
image.show()
|
||||
if show:
|
||||
image.show()
|
||||
else:
|
||||
out_path: Path = (
|
||||
Path(settings.debug.debug_output_path)
|
||||
/ f"debug_{conv_res.input.file.stem}"
|
||||
)
|
||||
out_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
||||
out_file = out_path / f"cells_page_{page.page_no:05}.png"
|
||||
image.save(str(out_file), format="png")
|
||||
|
||||
if settings.debug.visualize_cells:
|
||||
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
||||
|
||||
return page
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import copy
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
@@ -8,8 +8,11 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class TableStructureModel(BasePageModel):
|
||||
@@ -35,7 +38,13 @@ class TableStructureModel(BasePageModel):
|
||||
self.tf_predictor = TFPredictor(self.tm_config)
|
||||
self.scale = 2.0 # Scale up table input images to 144 dpi
|
||||
|
||||
def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
|
||||
def draw_table_and_cells(
|
||||
self,
|
||||
conv_res: ConversionResult,
|
||||
page: Page,
|
||||
tbl_list: Iterable[Table],
|
||||
show: bool = False,
|
||||
):
|
||||
assert page._backend is not None
|
||||
|
||||
image = (
|
||||
@@ -61,9 +70,21 @@ class TableStructureModel(BasePageModel):
|
||||
fill="black",
|
||||
)
|
||||
|
||||
image.show()
|
||||
if show:
|
||||
image.show()
|
||||
else:
|
||||
out_path: Path = (
|
||||
Path(settings.debug.debug_output_path)
|
||||
/ f"debug_{conv_res.input.file.stem}"
|
||||
)
|
||||
out_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
|
||||
image.save(str(out_file), format="png")
|
||||
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
@@ -74,98 +95,112 @@ class TableStructureModel(BasePageModel):
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "table_structure"):
|
||||
|
||||
assert page.predictions.layout is not None
|
||||
assert page.size is not None
|
||||
assert page.predictions.layout is not None
|
||||
assert page.size is not None
|
||||
|
||||
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
||||
page.predictions.tablestructure = (
|
||||
TableStructurePrediction()
|
||||
) # dummy
|
||||
|
||||
in_tables = [
|
||||
(
|
||||
cluster,
|
||||
[
|
||||
round(cluster.bbox.l) * self.scale,
|
||||
round(cluster.bbox.t) * self.scale,
|
||||
round(cluster.bbox.r) * self.scale,
|
||||
round(cluster.bbox.b) * self.scale,
|
||||
],
|
||||
in_tables = [
|
||||
(
|
||||
cluster,
|
||||
[
|
||||
round(cluster.bbox.l) * self.scale,
|
||||
round(cluster.bbox.t) * self.scale,
|
||||
round(cluster.bbox.r) * self.scale,
|
||||
round(cluster.bbox.b) * self.scale,
|
||||
],
|
||||
)
|
||||
for cluster in page.predictions.layout.clusters
|
||||
if cluster.label == DocItemLabel.TABLE
|
||||
]
|
||||
if not len(in_tables):
|
||||
yield page
|
||||
continue
|
||||
|
||||
tokens = []
|
||||
for c in page.cells:
|
||||
for cluster, _ in in_tables:
|
||||
if c.bbox.area() > 0:
|
||||
if (
|
||||
c.bbox.intersection_area_with(cluster.bbox)
|
||||
/ c.bbox.area()
|
||||
> 0.2
|
||||
):
|
||||
# Only allow non empty stings (spaces) into the cells of a table
|
||||
if len(c.text.strip()) > 0:
|
||||
new_cell = copy.deepcopy(c)
|
||||
new_cell.bbox = new_cell.bbox.scaled(
|
||||
scale=self.scale
|
||||
)
|
||||
|
||||
tokens.append(new_cell.model_dump())
|
||||
|
||||
page_input = {
|
||||
"tokens": tokens,
|
||||
"width": page.size.width * self.scale,
|
||||
"height": page.size.height * self.scale,
|
||||
}
|
||||
page_input["image"] = numpy.asarray(
|
||||
page.get_image(scale=self.scale)
|
||||
)
|
||||
for cluster in page.predictions.layout.clusters
|
||||
if cluster.label == DocItemLabel.TABLE
|
||||
]
|
||||
if not len(in_tables):
|
||||
yield page
|
||||
continue
|
||||
|
||||
tokens = []
|
||||
for c in page.cells:
|
||||
for cluster, _ in in_tables:
|
||||
if c.bbox.area() > 0:
|
||||
if (
|
||||
c.bbox.intersection_area_with(cluster.bbox)
|
||||
/ c.bbox.area()
|
||||
> 0.2
|
||||
):
|
||||
# Only allow non empty stings (spaces) into the cells of a table
|
||||
if len(c.text.strip()) > 0:
|
||||
new_cell = copy.deepcopy(c)
|
||||
new_cell.bbox = new_cell.bbox.scaled(
|
||||
scale=self.scale
|
||||
table_clusters, table_bboxes = zip(*in_tables)
|
||||
|
||||
if len(table_bboxes):
|
||||
tf_output = self.tf_predictor.multi_table_predict(
|
||||
page_input, table_bboxes, do_matching=self.do_cell_matching
|
||||
)
|
||||
|
||||
for table_cluster, table_out in zip(table_clusters, tf_output):
|
||||
table_cells = []
|
||||
for element in table_out["tf_responses"]:
|
||||
|
||||
if not self.do_cell_matching:
|
||||
the_bbox = BoundingBox.model_validate(
|
||||
element["bbox"]
|
||||
).scaled(1 / self.scale)
|
||||
text_piece = page._backend.get_text_in_rect(
|
||||
the_bbox
|
||||
)
|
||||
element["bbox"]["token"] = text_piece
|
||||
|
||||
tokens.append(new_cell.model_dump())
|
||||
tc = TableCell.model_validate(element)
|
||||
if self.do_cell_matching and tc.bbox is not None:
|
||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||
table_cells.append(tc)
|
||||
|
||||
page_input = {
|
||||
"tokens": tokens,
|
||||
"width": page.size.width * self.scale,
|
||||
"height": page.size.height * self.scale,
|
||||
}
|
||||
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
|
||||
# Retrieving cols/rows, after post processing:
|
||||
num_rows = table_out["predict_details"]["num_rows"]
|
||||
num_cols = table_out["predict_details"]["num_cols"]
|
||||
otsl_seq = table_out["predict_details"]["prediction"][
|
||||
"rs_seq"
|
||||
]
|
||||
|
||||
table_clusters, table_bboxes = zip(*in_tables)
|
||||
tbl = Table(
|
||||
otsl_seq=otsl_seq,
|
||||
table_cells=table_cells,
|
||||
num_rows=num_rows,
|
||||
num_cols=num_cols,
|
||||
id=table_cluster.id,
|
||||
page_no=page.page_no,
|
||||
cluster=table_cluster,
|
||||
label=DocItemLabel.TABLE,
|
||||
)
|
||||
|
||||
if len(table_bboxes):
|
||||
tf_output = self.tf_predictor.multi_table_predict(
|
||||
page_input, table_bboxes, do_matching=self.do_cell_matching
|
||||
)
|
||||
|
||||
for table_cluster, table_out in zip(table_clusters, tf_output):
|
||||
table_cells = []
|
||||
for element in table_out["tf_responses"]:
|
||||
|
||||
if not self.do_cell_matching:
|
||||
the_bbox = BoundingBox.model_validate(
|
||||
element["bbox"]
|
||||
).scaled(1 / self.scale)
|
||||
text_piece = page._backend.get_text_in_rect(the_bbox)
|
||||
element["bbox"]["token"] = text_piece
|
||||
|
||||
tc = TableCell.model_validate(element)
|
||||
if self.do_cell_matching and tc.bbox is not None:
|
||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||
table_cells.append(tc)
|
||||
|
||||
# Retrieving cols/rows, after post processing:
|
||||
num_rows = table_out["predict_details"]["num_rows"]
|
||||
num_cols = table_out["predict_details"]["num_cols"]
|
||||
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
|
||||
|
||||
tbl = Table(
|
||||
otsl_seq=otsl_seq,
|
||||
table_cells=table_cells,
|
||||
num_rows=num_rows,
|
||||
num_cols=num_cols,
|
||||
id=table_cluster.id,
|
||||
page_no=page.page_no,
|
||||
cluster=table_cluster,
|
||||
label=DocItemLabel.TABLE,
|
||||
)
|
||||
|
||||
page.predictions.tablestructure.table_map[table_cluster.id] = (
|
||||
tbl
|
||||
)
|
||||
page.predictions.tablestructure.table_map[
|
||||
table_cluster.id
|
||||
] = tbl
|
||||
|
||||
# For debugging purposes:
|
||||
# self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
|
||||
if settings.debug.visualize_tables:
|
||||
self.draw_table_and_cells(
|
||||
conv_res,
|
||||
page,
|
||||
page.predictions.tablestructure.table_map.values(),
|
||||
)
|
||||
|
||||
yield page
|
||||
|
||||
@@ -8,8 +8,11 @@ import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -102,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
|
||||
return df_filtered
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
@@ -113,62 +118,67 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
with TimeRecorder(conv_res, "ocr"):
|
||||
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".png", mode="w"
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(fname)
|
||||
|
||||
df = self._run_tesseract(fname)
|
||||
|
||||
# _log.info(df)
|
||||
|
||||
# Print relevant columns (bounding box and text)
|
||||
for ix, row in df.iterrows():
|
||||
text = row["text"]
|
||||
conf = row["conf"]
|
||||
|
||||
l = float(row["left"])
|
||||
b = float(row["top"])
|
||||
w = float(row["width"])
|
||||
h = float(row["height"])
|
||||
|
||||
t = b + h
|
||||
r = l + w
|
||||
|
||||
cell = OcrCell(
|
||||
id=ix,
|
||||
text=text,
|
||||
confidence=conf / 100.0,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(l / self.scale) + ocr_rect.l,
|
||||
(b / self.scale) + ocr_rect.t,
|
||||
(r / self.scale) + ocr_rect.l,
|
||||
(t / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
)
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".png", mode="w"
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(fname)
|
||||
|
||||
df = self._run_tesseract(fname)
|
||||
|
||||
# _log.info(df)
|
||||
|
||||
# Print relevant columns (bounding box and text)
|
||||
for ix, row in df.iterrows():
|
||||
text = row["text"]
|
||||
conf = row["conf"]
|
||||
|
||||
l = float(row["left"])
|
||||
b = float(row["top"])
|
||||
w = float(row["width"])
|
||||
h = float(row["height"])
|
||||
|
||||
t = b + h
|
||||
r = l + w
|
||||
|
||||
cell = OcrCell(
|
||||
id=ix,
|
||||
text=text,
|
||||
confidence=conf / 100.0,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(l / self.scale) + ocr_rect.l,
|
||||
(b / self.scale) + ocr_rect.t,
|
||||
(r / self.scale) + ocr_rect.l,
|
||||
(t / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
)
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
# DEBUG code:
|
||||
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
||||
if settings.debug.visualize_ocr:
|
||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||
|
||||
yield page
|
||||
|
||||
@@ -4,8 +4,11 @@ from typing import Iterable
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -61,7 +64,9 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
# Finalize the tesseractAPI
|
||||
self.reader.End()
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
@@ -72,59 +77,66 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
assert self.reader is not None
|
||||
with TimeRecorder(conv_res, "ocr"):
|
||||
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
assert self.reader is not None
|
||||
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
# Retrieve text snippets with their bounding boxes
|
||||
self.reader.SetImage(high_res_image)
|
||||
boxes = self.reader.GetComponentImages(
|
||||
self.reader_RIL.TEXTLINE, True
|
||||
)
|
||||
|
||||
cells = []
|
||||
for ix, (im, box, _, _) in enumerate(boxes):
|
||||
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
||||
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
|
||||
|
||||
# Extract text within the bounding box
|
||||
text = self.reader.GetUTF8Text().strip()
|
||||
confidence = self.reader.MeanTextConf()
|
||||
left = box["x"] / self.scale
|
||||
bottom = box["y"] / self.scale
|
||||
right = (box["x"] + box["w"]) / self.scale
|
||||
top = (box["y"] + box["h"]) / self.scale
|
||||
|
||||
cells.append(
|
||||
OcrCell(
|
||||
id=ix,
|
||||
text=text,
|
||||
confidence=confidence,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
)
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
|
||||
# del high_res_image
|
||||
all_ocr_cells.extend(cells)
|
||||
# Retrieve text snippets with their bounding boxes
|
||||
self.reader.SetImage(high_res_image)
|
||||
boxes = self.reader.GetComponentImages(
|
||||
self.reader_RIL.TEXTLINE, True
|
||||
)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
||||
cells = []
|
||||
for ix, (im, box, _, _) in enumerate(boxes):
|
||||
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
||||
self.reader.SetRectangle(
|
||||
box["x"], box["y"], box["w"], box["h"]
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
# Extract text within the bounding box
|
||||
text = self.reader.GetUTF8Text().strip()
|
||||
confidence = self.reader.MeanTextConf()
|
||||
left = box["x"] / self.scale
|
||||
bottom = box["y"] / self.scale
|
||||
right = (box["x"] + box["w"]) / self.scale
|
||||
top = (box["y"] + box["h"]) / self.scale
|
||||
|
||||
cells.append(
|
||||
OcrCell(
|
||||
id=ix,
|
||||
text=text,
|
||||
confidence=confidence,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# del high_res_image
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self.filter_ocr_cells(
|
||||
all_ocr_cells, page.cells
|
||||
)
|
||||
|
||||
page.cells.extend(filtered_ocr_cells)
|
||||
|
||||
# DEBUG code:
|
||||
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
||||
if settings.debug.visualize_ocr:
|
||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||
|
||||
yield page
|
||||
|
||||
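Taken together, the hunks above route all debug rendering through settings.debug instead of image.show(). A rough sketch of how these toggles might be set before a conversion follows; the field names (visualize_cells, visualize_ocr, visualize_layout, visualize_tables, debug_output_path) are the ones used in the diff, while the assumption that they can simply be assigned at runtime, and the example path value, are not verified here.

from docling.datamodel.settings import settings

# Enable the per-stage visualizations added in this commit (assumed mutable).
settings.debug.visualize_cells = True
settings.debug.visualize_ocr = True
settings.debug.visualize_layout = True
settings.debug.visualize_tables = True

# Per the diff, PNGs are written under <debug_output_path>/debug_<input-stem>/,
# e.g. cells_page_00001.png, ocr_page_00001.png, layout_page_00001.png,
# table_struct_page_00001.png, doc_page_00001.png.
settings.debug.debug_output_path = "debug"  # example value, assumed to accept a str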