chore: Cleaning the example of post_process_ocr_with_vlm (#2693)

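Clean up the post_process_ocr_with_vlm example: remove commented-out code and a stale TODO marker, merge the paired debug prints for empty crops into single messages, replace the hard-coded zero table-cell expansion with a `table_cell_expansion_factor` attribute, and print the input/output paths and a notice when an existing result is skipped.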

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2025-11-27 12:38:45 +01:00
committed by GitHub
parent fa21128138
commit c0b57ae389

View File

@@ -66,12 +66,10 @@ PRINT_RESULT_MARKDOWN = False
def is_empty_fast_with_lines_pil( def is_empty_fast_with_lines_pil(
pil_img: Image.Image, pil_img: Image.Image,
# downscale_max_side: int = 64, downscale_max_side: int = 48, # 64
downscale_max_side: int = 48,
grad_threshold: float = 15.0, # how strong a gradient must be to count as edge grad_threshold: float = 15.0, # how strong a gradient must be to count as edge
min_line_coverage: float = 0.6, # line must cover 60% of height/width min_line_coverage: float = 0.6, # line must cover 60% of height/width
# max_allowed_lines: int = 4, # allow up to this many strong lines max_allowed_lines: int = 10, # allow up to this many strong lines (default 4)
max_allowed_lines: int = 10, # allow up to this many strong lines
edge_fraction_threshold: float = 0.0035, edge_fraction_threshold: float = 0.0035,
): ):
""" """
@@ -113,7 +111,6 @@ def is_empty_fast_with_lines_pil(
gray, dtype=np.float32 gray, dtype=np.float32
) # shape (h, w) in PIL, but note: PIL size is (w, h) ) # shape (h, w) in PIL, but note: PIL size is (w, h)
H, W = arr.shape H, W = arr.shape
# total_pixels = H * W
# 5) Compute simple gradients (forward differences) # 5) Compute simple gradients (forward differences)
gx = np.zeros_like(arr) gx = np.zeros_like(arr)
@@ -309,7 +306,6 @@ class PostOcrApiEnrichmentModel(
page_ix page_ix
].image.pil_image.crop(expanded_bbox.as_tuple()) ].image.pil_image.crop(expanded_bbox.as_tuple())
# cropped_image = safe_crop(conv_res.document.pages[page_ix].image.pil_image, expanded_bbox.as_tuple())
is_empty, rem_frac, debug = is_empty_fast_with_lines_pil( is_empty, rem_frac, debug = is_empty_fast_with_lines_pil(
cropped_image cropped_image
) )
@@ -319,8 +315,9 @@ class PostOcrApiEnrichmentModel(
cropped_image.show() cropped_image.show()
except Exception as e: except Exception as e:
print(f"Error with image: {e}") print(f"Error with image: {e}")
print(f"!!! DETECTED EMPTY FORM ITEM IMAGE CROP !!! {rem_frac}") print(
print(debug) f"Detected empty form item image crop: {rem_frac} - {debug}"
)
else: else:
result.append( result.append(
PostOcrEnrichmentElement(item=c, image=[cropped_image]) PostOcrEnrichmentElement(item=c, image=[cropped_image])
@@ -340,20 +337,9 @@ class PostOcrApiEnrichmentModel(
new_size=conv_res.document.pages[page_ix].image.size, new_size=conv_res.document.pages[page_ix].image.size,
) )
"""
expanded_bbox = bbox.expand_by_scale( expanded_bbox = bbox.expand_by_scale(
x_scale=self.expansion_factor, x_scale=self.table_cell_expansion_factor,
y_scale=self.expansion_factor, y_scale=self.table_cell_expansion_factor,
).to_top_left_origin(
page_height=conv_res.document.pages[
page_ix
].image.size.height
)
"""
expanded_bbox = bbox.expand_by_scale(
x_scale=0,
y_scale=0,
).to_top_left_origin( ).to_top_left_origin(
page_height=conv_res.document.pages[ page_height=conv_res.document.pages[
page_ix page_ix
@@ -372,7 +358,6 @@ class PostOcrApiEnrichmentModel(
page_ix page_ix
].image.pil_image.crop(expanded_bbox.as_tuple()) ].image.pil_image.crop(expanded_bbox.as_tuple())
# cropped_image = safe_crop(conv_res.document.pages[page_ix].image.pil_image, expanded_bbox.as_tuple())
is_empty, rem_frac, debug = ( is_empty, rem_frac, debug = (
is_empty_fast_with_lines_pil(cropped_image) is_empty_fast_with_lines_pil(cropped_image)
) )
@@ -383,9 +368,8 @@ class PostOcrApiEnrichmentModel(
except Exception as e: except Exception as e:
print(f"Error with image: {e}") print(f"Error with image: {e}")
print( print(
f"!!! DETECTED EMPTY TABLE CELL IMAGE CROP !!! {rem_frac}" f"Detected empty table cell image crop: {rem_frac} - {debug}"
) )
print(debug)
else: else:
if SHOW_NONEMPTY_CROPS: if SHOW_NONEMPTY_CROPS:
cropped_image.show() cropped_image.show()
@@ -425,7 +409,6 @@ class PostOcrApiEnrichmentModel(
cropped_image = conv_res.document.pages[ cropped_image = conv_res.document.pages[
page_ix page_ix
].image.pil_image.crop(expanded_bbox.as_tuple()) ].image.pil_image.crop(expanded_bbox.as_tuple())
# cropped_image = safe_crop(conv_res.document.pages[page_ix].image.pil_image, expanded_bbox.as_tuple())
is_empty, rem_frac, debug = is_empty_fast_with_lines_pil( is_empty, rem_frac, debug = is_empty_fast_with_lines_pil(
cropped_image cropped_image
@@ -436,15 +419,11 @@ class PostOcrApiEnrichmentModel(
cropped_image.show() cropped_image.show()
except Exception as e: except Exception as e:
print(f"Error with image: {e}") print(f"Error with image: {e}")
print(f"!!! DETECTED EMPTY TEXT IMAGE CROP !!! {rem_frac}") print(f"Detected empty text crop: {rem_frac} - {debug}")
print(debug)
else: else:
multiple_crops.append(cropped_image) multiple_crops.append(cropped_image)
print("")
print(f"cropped image size: {cropped_image.size}")
print(type(element))
if hasattr(element, "text"): if hasattr(element, "text"):
print(f"OLD TEXT: {element.text}") print(f"\nOLD TEXT: {element.text}")
else: else:
print("Not a text element") print("Not a text element")
if len(multiple_crops) > 0: if len(multiple_crops) > 0:
@@ -471,7 +450,7 @@ class PostOcrApiEnrichmentModel(
self.options = options self.options = options
self.concurrency = 2 self.concurrency = 2
self.expansion_factor = 0.05 self.expansion_factor = 0.05
# self.expansion_factor = 0.0 self.table_cell_expansion_factor = 0.0 # do not modify table cell size
self.elements_batch_size = 4 self.elements_batch_size = 4
self._accelerator_options = accelerator_options self._accelerator_options = accelerator_options
self._artifacts_path = ( self._artifacts_path = (
@@ -563,7 +542,7 @@ class PostOcrApiEnrichmentModel(
if no_long_repeats(output, 50): if no_long_repeats(output, 50):
if VERBOSE: if VERBOSE:
if isinstance(item, (TextItem)): if isinstance(item, (TextItem)):
print(f"OLD TEXT: {item.text}") print(f"\nOLD TEXT: {item.text}")
# Re-populate text # Re-populate text
if isinstance(item, (TextItem, GraphCell)): if isinstance(item, (TextItem, GraphCell)):
@@ -642,7 +621,6 @@ def post_process_json(in_json: Path, out_final_json: Path):
) )
) )
# try:
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
format_options={ format_options={
InputFormat.JSON_DOCLING: FormatOption( InputFormat.JSON_DOCLING: FormatOption(
@@ -660,8 +638,6 @@ def post_process_json(in_json: Path, out_final_json: Path):
md = result.document.export_to_markdown() md = result.document.export_to_markdown()
print("*** MARKDOWN ***") print("*** MARKDOWN ***")
print(md) print(md)
# except:
# print("ERROR IN OCR for: {}".format(in_json))
def process_pdf(pdf_path: Path, scratch_dir: Path, out_dir: Path): def process_pdf(pdf_path: Path, scratch_dir: Path, out_dir: Path):
@@ -670,6 +646,7 @@ def process_pdf(pdf_path: Path, scratch_dir: Path, out_dir: Path):
inter_json.parent.mkdir(parents=True, exist_ok=True) inter_json.parent.mkdir(parents=True, exist_ok=True)
final_json.parent.mkdir(parents=True, exist_ok=True) final_json.parent.mkdir(parents=True, exist_ok=True)
if final_json.exists() and final_json.stat().st_size > 0: if final_json.exists() and final_json.stat().st_size > 0:
print(f"Result already found here: '{final_json}', aborting...")
return # already done return # already done
convert_pdf(pdf_path, inter_json) convert_pdf(pdf_path, inter_json)
post_process_json(inter_json, final_json) post_process_json(inter_json, final_json)
@@ -708,7 +685,7 @@ def run_jsons(in_path: Path, out_dir: Path):
jsons = sorted(in_path.glob("*.json")) jsons = sorted(in_path.glob("*.json"))
if not jsons: if not jsons:
raise SystemExit("Folder mode expects one or more .json files") raise SystemExit("Folder mode expects one or more .json files")
# TODO: Look for ocr_documents.txt, in case found, respect only the jsons # Look for ocr_documents.txt, in case found, respect only the jsons
filtered_jsons = filter_jsons_by_ocr_list(jsons, in_path) filtered_jsons = filter_jsons_by_ocr_list(jsons, in_path)
for j in tqdm(filtered_jsons): for j in tqdm(filtered_jsons):
print("") print("")
@@ -740,6 +717,8 @@ def main():
in_path = Path(args.in_path).expanduser().resolve() in_path = Path(args.in_path).expanduser().resolve()
out_dir = Path(args.out_dir).expanduser().resolve() out_dir = Path(args.out_dir).expanduser().resolve()
print(f"in_path: {in_path}")
print(f"out_dir: {out_dir}")
scratch_dir = out_dir / "temp" scratch_dir = out_dir / "temp"
if not in_path.exists(): if not in_path.exists():