Visualization code outputs PNG files to the debug directory

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-29 13:53:29 +01:00
parent 0cdccb3da1
commit e1b83ec485
9 changed files with 91 additions and 22 deletions

View File

@ -36,7 +36,7 @@ class DebugSettings(BaseModel):
profile_pipeline_timings: bool = False profile_pipeline_timings: bool = False
# Path used to output debug information. # Path used to output debug information.
debug_output_path: str = str(Path.cwd()) debug_output_path: str = str(Path.cwd() / "debug")
class AppSettings(BaseSettings): class AppSettings(BaseSettings):

View File

@ -1,6 +1,7 @@
import copy import copy
import logging import logging
from abc import abstractmethod from abc import abstractmethod
from pathlib import Path
from typing import Iterable, List from typing import Iterable, List
import numpy as np import numpy as np
@ -12,6 +13,7 @@ from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -115,7 +117,7 @@ class BaseOcrModel(BasePageModel):
] ]
return filtered_ocr_cells return filtered_ocr_cells
def draw_ocr_rects_and_cells(self, page, ocr_rects): def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image) image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image, "RGBA") draw = ImageDraw.Draw(image, "RGBA")
@ -132,7 +134,18 @@ class BaseOcrModel(BasePageModel):
if isinstance(tc, OcrCell): if isinstance(tc, OcrCell):
color = "magenta" color = "magenta"
draw.rectangle([(x0, y0), (x1, y1)], outline=color) draw.rectangle([(x0, y0), (x1, y1)], outline=color)
image.show()
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = out_path / f"ocr_page_{page.page_no:05}.png"
image.save(str(out_file), format="png")
@abstractmethod @abstractmethod
def __call__( def __call__(

View File

@ -1,5 +1,6 @@
import copy import copy
import random import random
from pathlib import Path
from typing import List, Union from typing import List, Union
from deepsearch_glm.nlp_utils import init_nlp_model from deepsearch_glm.nlp_utils import init_nlp_model
@ -27,6 +28,7 @@ from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash from docling.utils.utils import create_hash
@ -236,15 +238,15 @@ class GlmModel:
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
# DEBUG code: # DEBUG code:
def draw_clusters_and_cells(ds_document, page_no): def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
clusters_to_draw = [] clusters_to_draw = []
image = copy.deepcopy(conv_res.pages[page_no].image) image = copy.deepcopy(conv_res.pages[page_no].image)
for ix, elem in enumerate(ds_document.main_text): for ix, elem in enumerate(ds_document.main_text):
if isinstance(elem, BaseText): if isinstance(elem, BaseText):
prov = elem.prov[0] prov = elem.prov[0] # type: ignore
elif isinstance(elem, Ref): elif isinstance(elem, Ref):
_, arr, index = elem.ref.split("/") _, arr, index = elem.ref.split("/")
index = int(index) index = int(index) # type: ignore
if arr == "tables": if arr == "tables":
prov = ds_document.tables[index].prov[0] prov = ds_document.tables[index].prov[0]
elif arr == "figures": elif arr == "figures":
@ -258,7 +260,7 @@ class GlmModel:
id=ix, id=ix,
label=elem.name, label=elem.name,
bbox=BoundingBox.from_tuple( bbox=BoundingBox.from_tuple(
coord=prov.bbox, coord=prov.bbox, # type: ignore
origin=CoordOrigin.BOTTOMLEFT, origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(conv_res.pages[page_no].size.height), ).to_top_left_origin(conv_res.pages[page_no].size.height),
) )
@ -278,9 +280,21 @@ class GlmModel:
for tc in c.cells: # [:1]: for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple() x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells(ds_doc, 0) if show:
# draw_clusters_and_cells(exported_doc, 0) image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = out_path / f"doc_page_{page_no:05}.png"
image.save(str(out_file), format="png")
# for item in ds_doc.page_dimensions:
# page_no = item.page
# draw_clusters_and_cells(ds_doc, page_no)
return docling_doc return docling_doc

View File

@ -96,6 +96,6 @@ class EasyOcrModel(BaseOcrModel):
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(page, ocr_rects) self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page yield page

View File

@ -324,7 +324,7 @@ class LayoutModel(BasePageModel):
# clusters = self.sort_clusters_by_cell_order(clusters) # clusters = self.sort_clusters_by_cell_order(clusters)
# DEBUG code: # DEBUG code:
def draw_clusters_and_cells(show: bool = True): def draw_clusters_and_cells(show: bool = False):
image = copy.deepcopy(page.image) image = copy.deepcopy(page.image)
if image is not None: if image is not None:
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
@ -344,6 +344,17 @@ class LayoutModel(BasePageModel):
) )
if show: if show:
image.show() image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = (
out_path / f"layout_page_{page.page_no:05}.png"
)
image.save(str(out_file), format="png")
# draw_clusters_and_cells() # draw_clusters_and_cells()

View File

@ -1,3 +1,4 @@
from pathlib import Path
from typing import Iterable, Optional from typing import Iterable, Optional
from PIL import ImageDraw from PIL import ImageDraw
@ -5,6 +6,7 @@ from pydantic import BaseModel
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
@ -27,7 +29,7 @@ class PagePreprocessingModel(BasePageModel):
else: else:
with TimeRecorder(conv_res, "page_parse"): with TimeRecorder(conv_res, "page_parse"):
page = self._populate_page_images(page) page = self._populate_page_images(page)
page = self._parse_page_cells(page) page = self._parse_page_cells(conv_res, page)
yield page yield page
# Generate the page image and store it in the page object # Generate the page image and store it in the page object
@ -48,19 +50,30 @@ class PagePreprocessingModel(BasePageModel):
return page return page
# Extract and populate the page cells and store it in the page object # Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, page: Page) -> Page: def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
assert page._backend is not None assert page._backend is not None
page.cells = list(page._backend.get_text_cells()) page.cells = list(page._backend.get_text_cells())
# DEBUG code: # DEBUG code:
def draw_text_boxes(image, cells): def draw_text_boxes(image, cells, show: bool = False):
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
for c in cells: for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple() x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red") draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show() if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
# draw_text_boxes(page.get_image(scale=1.0), cells) out_file = out_path / f"cells_page_{page.page_no:05}.png"
image.save(str(out_file), format="png")
if settings.debug.visualize_cells:
draw_text_boxes(page.get_image(scale=1.0), page.cells)
return page return page

View File

@ -38,7 +38,13 @@ class TableStructureModel(BasePageModel):
self.tf_predictor = TFPredictor(self.tm_config) self.tf_predictor = TFPredictor(self.tm_config)
self.scale = 2.0 # Scale up table input images to 144 dpi self.scale = 2.0 # Scale up table input images to 144 dpi
def draw_table_and_cells(self, page: Page, tbl_list: Iterable[Table]): def draw_table_and_cells(
self,
conv_res: ConversionResult,
page: Page,
tbl_list: Iterable[Table],
show: bool = False,
):
assert page._backend is not None assert page._backend is not None
image = ( image = (
@ -64,7 +70,17 @@ class TableStructureModel(BasePageModel):
fill="black", fill="black",
) )
image.show() if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)
out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
image.save(str(out_file), format="png")
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
@ -182,7 +198,9 @@ class TableStructureModel(BasePageModel):
# For debugging purposes: # For debugging purposes:
if settings.debug.visualize_tables: if settings.debug.visualize_tables:
self.draw_table_and_cells( self.draw_table_and_cells(
page, page.predictions.tablestructure.table_map.values() conv_res,
page,
page.predictions.tablestructure.table_map.values(),
) )
yield page yield page

View File

@ -179,6 +179,6 @@ class TesseractOcrCliModel(BaseOcrModel):
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(page, ocr_rects) self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page yield page

View File

@ -137,6 +137,6 @@ class TesseractOcrModel(BaseOcrModel):
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(page, ocr_rects) self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page yield page