mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
chore: Cleaning the example of post_process_ocr_with_vlm (#2693)
Cleaning the example Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
53
docs/examples/post_process_ocr_with_vlm.py
vendored
53
docs/examples/post_process_ocr_with_vlm.py
vendored
@@ -66,12 +66,10 @@ PRINT_RESULT_MARKDOWN = False
|
|||||||
|
|
||||||
def is_empty_fast_with_lines_pil(
|
def is_empty_fast_with_lines_pil(
|
||||||
pil_img: Image.Image,
|
pil_img: Image.Image,
|
||||||
# downscale_max_side: int = 64,
|
downscale_max_side: int = 48, # 64
|
||||||
downscale_max_side: int = 48,
|
|
||||||
grad_threshold: float = 15.0, # how strong a gradient must be to count as edge
|
grad_threshold: float = 15.0, # how strong a gradient must be to count as edge
|
||||||
min_line_coverage: float = 0.6, # line must cover 60% of height/width
|
min_line_coverage: float = 0.6, # line must cover 60% of height/width
|
||||||
# max_allowed_lines: int = 4, # allow up to this many strong lines
|
max_allowed_lines: int = 10, # allow up to this many strong lines (default 4)
|
||||||
max_allowed_lines: int = 10, # allow up to this many strong lines
|
|
||||||
edge_fraction_threshold: float = 0.0035,
|
edge_fraction_threshold: float = 0.0035,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -113,7 +111,6 @@ def is_empty_fast_with_lines_pil(
|
|||||||
gray, dtype=np.float32
|
gray, dtype=np.float32
|
||||||
) # shape (h, w) in PIL, but note: PIL size is (w, h)
|
) # shape (h, w) in PIL, but note: PIL size is (w, h)
|
||||||
H, W = arr.shape
|
H, W = arr.shape
|
||||||
# total_pixels = H * W
|
|
||||||
|
|
||||||
# 5) Compute simple gradients (forward differences)
|
# 5) Compute simple gradients (forward differences)
|
||||||
gx = np.zeros_like(arr)
|
gx = np.zeros_like(arr)
|
||||||
@@ -309,7 +306,6 @@ class PostOcrApiEnrichmentModel(
|
|||||||
page_ix
|
page_ix
|
||||||
].image.pil_image.crop(expanded_bbox.as_tuple())
|
].image.pil_image.crop(expanded_bbox.as_tuple())
|
||||||
|
|
||||||
# cropped_image = safe_crop(conv_res.document.pages[page_ix].image.pil_image, expanded_bbox.as_tuple())
|
|
||||||
is_empty, rem_frac, debug = is_empty_fast_with_lines_pil(
|
is_empty, rem_frac, debug = is_empty_fast_with_lines_pil(
|
||||||
cropped_image
|
cropped_image
|
||||||
)
|
)
|
||||||
@@ -319,8 +315,9 @@ class PostOcrApiEnrichmentModel(
|
|||||||
cropped_image.show()
|
cropped_image.show()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error with image: {e}")
|
print(f"Error with image: {e}")
|
||||||
print(f"!!! DETECTED EMPTY FORM ITEM IMAGE CROP !!! {rem_frac}")
|
print(
|
||||||
print(debug)
|
f"Detected empty form item image crop: {rem_frac} - {debug}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
result.append(
|
result.append(
|
||||||
PostOcrEnrichmentElement(item=c, image=[cropped_image])
|
PostOcrEnrichmentElement(item=c, image=[cropped_image])
|
||||||
@@ -340,20 +337,9 @@ class PostOcrApiEnrichmentModel(
|
|||||||
new_size=conv_res.document.pages[page_ix].image.size,
|
new_size=conv_res.document.pages[page_ix].image.size,
|
||||||
)
|
)
|
||||||
|
|
||||||
"""
|
|
||||||
expanded_bbox = bbox.expand_by_scale(
|
expanded_bbox = bbox.expand_by_scale(
|
||||||
x_scale=self.expansion_factor,
|
x_scale=self.table_cell_expansion_factor,
|
||||||
y_scale=self.expansion_factor,
|
y_scale=self.table_cell_expansion_factor,
|
||||||
).to_top_left_origin(
|
|
||||||
page_height=conv_res.document.pages[
|
|
||||||
page_ix
|
|
||||||
].image.size.height
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
|
|
||||||
expanded_bbox = bbox.expand_by_scale(
|
|
||||||
x_scale=0,
|
|
||||||
y_scale=0,
|
|
||||||
).to_top_left_origin(
|
).to_top_left_origin(
|
||||||
page_height=conv_res.document.pages[
|
page_height=conv_res.document.pages[
|
||||||
page_ix
|
page_ix
|
||||||
@@ -372,7 +358,6 @@ class PostOcrApiEnrichmentModel(
|
|||||||
page_ix
|
page_ix
|
||||||
].image.pil_image.crop(expanded_bbox.as_tuple())
|
].image.pil_image.crop(expanded_bbox.as_tuple())
|
||||||
|
|
||||||
# cropped_image = safe_crop(conv_res.document.pages[page_ix].image.pil_image, expanded_bbox.as_tuple())
|
|
||||||
is_empty, rem_frac, debug = (
|
is_empty, rem_frac, debug = (
|
||||||
is_empty_fast_with_lines_pil(cropped_image)
|
is_empty_fast_with_lines_pil(cropped_image)
|
||||||
)
|
)
|
||||||
@@ -383,9 +368,8 @@ class PostOcrApiEnrichmentModel(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error with image: {e}")
|
print(f"Error with image: {e}")
|
||||||
print(
|
print(
|
||||||
f"!!! DETECTED EMPTY TABLE CELL IMAGE CROP !!! {rem_frac}"
|
f"Detected empty table cell image crop: {rem_frac} - {debug}"
|
||||||
)
|
)
|
||||||
print(debug)
|
|
||||||
else:
|
else:
|
||||||
if SHOW_NONEMPTY_CROPS:
|
if SHOW_NONEMPTY_CROPS:
|
||||||
cropped_image.show()
|
cropped_image.show()
|
||||||
@@ -425,7 +409,6 @@ class PostOcrApiEnrichmentModel(
|
|||||||
cropped_image = conv_res.document.pages[
|
cropped_image = conv_res.document.pages[
|
||||||
page_ix
|
page_ix
|
||||||
].image.pil_image.crop(expanded_bbox.as_tuple())
|
].image.pil_image.crop(expanded_bbox.as_tuple())
|
||||||
# cropped_image = safe_crop(conv_res.document.pages[page_ix].image.pil_image, expanded_bbox.as_tuple())
|
|
||||||
|
|
||||||
is_empty, rem_frac, debug = is_empty_fast_with_lines_pil(
|
is_empty, rem_frac, debug = is_empty_fast_with_lines_pil(
|
||||||
cropped_image
|
cropped_image
|
||||||
@@ -436,15 +419,11 @@ class PostOcrApiEnrichmentModel(
|
|||||||
cropped_image.show()
|
cropped_image.show()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error with image: {e}")
|
print(f"Error with image: {e}")
|
||||||
print(f"!!! DETECTED EMPTY TEXT IMAGE CROP !!! {rem_frac}")
|
print(f"Detected empty text crop: {rem_frac} - {debug}")
|
||||||
print(debug)
|
|
||||||
else:
|
else:
|
||||||
multiple_crops.append(cropped_image)
|
multiple_crops.append(cropped_image)
|
||||||
print("")
|
|
||||||
print(f"cropped image size: {cropped_image.size}")
|
|
||||||
print(type(element))
|
|
||||||
if hasattr(element, "text"):
|
if hasattr(element, "text"):
|
||||||
print(f"OLD TEXT: {element.text}")
|
print(f"\nOLD TEXT: {element.text}")
|
||||||
else:
|
else:
|
||||||
print("Not a text element")
|
print("Not a text element")
|
||||||
if len(multiple_crops) > 0:
|
if len(multiple_crops) > 0:
|
||||||
@@ -471,7 +450,7 @@ class PostOcrApiEnrichmentModel(
|
|||||||
self.options = options
|
self.options = options
|
||||||
self.concurrency = 2
|
self.concurrency = 2
|
||||||
self.expansion_factor = 0.05
|
self.expansion_factor = 0.05
|
||||||
# self.expansion_factor = 0.0
|
self.table_cell_expansion_factor = 0.0 # do not modify table cell size
|
||||||
self.elements_batch_size = 4
|
self.elements_batch_size = 4
|
||||||
self._accelerator_options = accelerator_options
|
self._accelerator_options = accelerator_options
|
||||||
self._artifacts_path = (
|
self._artifacts_path = (
|
||||||
@@ -563,7 +542,7 @@ class PostOcrApiEnrichmentModel(
|
|||||||
if no_long_repeats(output, 50):
|
if no_long_repeats(output, 50):
|
||||||
if VERBOSE:
|
if VERBOSE:
|
||||||
if isinstance(item, (TextItem)):
|
if isinstance(item, (TextItem)):
|
||||||
print(f"OLD TEXT: {item.text}")
|
print(f"\nOLD TEXT: {item.text}")
|
||||||
|
|
||||||
# Re-populate text
|
# Re-populate text
|
||||||
if isinstance(item, (TextItem, GraphCell)):
|
if isinstance(item, (TextItem, GraphCell)):
|
||||||
@@ -642,7 +621,6 @@ def post_process_json(in_json: Path, out_final_json: Path):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# try:
|
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.JSON_DOCLING: FormatOption(
|
InputFormat.JSON_DOCLING: FormatOption(
|
||||||
@@ -660,8 +638,6 @@ def post_process_json(in_json: Path, out_final_json: Path):
|
|||||||
md = result.document.export_to_markdown()
|
md = result.document.export_to_markdown()
|
||||||
print("*** MARKDOWN ***")
|
print("*** MARKDOWN ***")
|
||||||
print(md)
|
print(md)
|
||||||
# except:
|
|
||||||
# print("ERROR IN OCR for: {}".format(in_json))
|
|
||||||
|
|
||||||
|
|
||||||
def process_pdf(pdf_path: Path, scratch_dir: Path, out_dir: Path):
|
def process_pdf(pdf_path: Path, scratch_dir: Path, out_dir: Path):
|
||||||
@@ -670,6 +646,7 @@ def process_pdf(pdf_path: Path, scratch_dir: Path, out_dir: Path):
|
|||||||
inter_json.parent.mkdir(parents=True, exist_ok=True)
|
inter_json.parent.mkdir(parents=True, exist_ok=True)
|
||||||
final_json.parent.mkdir(parents=True, exist_ok=True)
|
final_json.parent.mkdir(parents=True, exist_ok=True)
|
||||||
if final_json.exists() and final_json.stat().st_size > 0:
|
if final_json.exists() and final_json.stat().st_size > 0:
|
||||||
|
print(f"Result already found here: '{final_json}', aborting...")
|
||||||
return # already done
|
return # already done
|
||||||
convert_pdf(pdf_path, inter_json)
|
convert_pdf(pdf_path, inter_json)
|
||||||
post_process_json(inter_json, final_json)
|
post_process_json(inter_json, final_json)
|
||||||
@@ -708,7 +685,7 @@ def run_jsons(in_path: Path, out_dir: Path):
|
|||||||
jsons = sorted(in_path.glob("*.json"))
|
jsons = sorted(in_path.glob("*.json"))
|
||||||
if not jsons:
|
if not jsons:
|
||||||
raise SystemExit("Folder mode expects one or more .json files")
|
raise SystemExit("Folder mode expects one or more .json files")
|
||||||
# TODO: Look for ocr_documents.txt, in case found, respect only the jsons
|
# Look for ocr_documents.txt, in case found, respect only the jsons
|
||||||
filtered_jsons = filter_jsons_by_ocr_list(jsons, in_path)
|
filtered_jsons = filter_jsons_by_ocr_list(jsons, in_path)
|
||||||
for j in tqdm(filtered_jsons):
|
for j in tqdm(filtered_jsons):
|
||||||
print("")
|
print("")
|
||||||
@@ -740,6 +717,8 @@ def main():
|
|||||||
|
|
||||||
in_path = Path(args.in_path).expanduser().resolve()
|
in_path = Path(args.in_path).expanduser().resolve()
|
||||||
out_dir = Path(args.out_dir).expanduser().resolve()
|
out_dir = Path(args.out_dir).expanduser().resolve()
|
||||||
|
print(f"in_path: {in_path}")
|
||||||
|
print(f"out_dir: {out_dir}")
|
||||||
scratch_dir = out_dir / "temp"
|
scratch_dir = out_dir / "temp"
|
||||||
|
|
||||||
if not in_path.exists():
|
if not in_path.exists():
|
||||||
|
|||||||
Reference in New Issue
Block a user