docs: Example on how to apply external OCR as post-processing (#2517)

* Example on how to apply OCR to a Docling Document as a post-processing step with "nanonets-ocr2-3b" via LM Studio Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added support for elements with multiple provenances Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * cleaning up Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * improved prompt for nanonets-ocr2-3b Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * cleaning up Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * excluded example from CI Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * updated class name Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Improved usability of the example, added a simple CLI, and some helper functions Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fix api_image_request usage Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix pydantic errors Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Improvements and corrections Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added string sanitization, removing line breaks from remote OCR, also preserving original text from json Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added quick and reliable detection of empty image crops (elements, table cells, form items); these are not sent to OCR Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Example respects ocr_documents.txt, tuned empty crop detection Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * cleaning api_image_request Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> --------- Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-11 22:28:31 +00:00 · 2025-11-27 11:04:40 +01:00
parent 0049857c7d
commit fa21128138
3 changed files with 821 additions and 40 deletions
--- a/docling/utils/api_image_request.py
+++ b/docling/utils/api_image_request.py
@@ -23,51 +23,72 @@ def api_image_request(
    **params,
 ) -> Tuple[str, Optional[int], VlmStopReason]:
    img_io = BytesIO()
-    image.save(img_io, "PNG")
-    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-    messages = [
-        {
-            "role": "user",
-            "content": [
+    image = (
+        image.copy()
+    )  # Fix for inconsistent PIL image width/height to actual byte data
+    image = image.convert("RGBA")
+    good_image = True
+    try:
+        image.save(img_io, "PNG")
+    except Exception as e:
+        good_image = False
+        _log.error(f"Error, corrupter PNG of size: {image.size}: {e}")
+
+    if good_image:
+        try:
+            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+            messages = [
                {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                },
-                {
-                    "type": "text",
-                    "text": prompt,
-                },
-            ],
-        }
-    ]
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{image_base64}"
+                            },
+                        },
+                        {
+                            "type": "text",
+                            "text": prompt,
+                        },
+                    ],
+                }
+            ]

-    payload = {
-        "messages": messages,
-        **params,
-    }
+            payload = {
+                "messages": messages,
+                **params,
+            }

-    headers = headers or {}
+            headers = headers or {}

-    r = requests.post(
-        str(url),
-        headers=headers,
-        json=payload,
-        timeout=timeout,
-    )
-    if not r.ok:
-        _log.error(f"Error calling the API. Response was {r.text}")
-    r.raise_for_status()
+            r = requests.post(
+                str(url),
+                headers=headers,
+                json=payload,
+                timeout=timeout,
+            )
+            if not r.ok:
+                _log.error(f"Error calling the API. Response was {r.text}")
+                # image.show()
+            # r.raise_for_status()

-    api_resp = OpenAiApiResponse.model_validate_json(r.text)
-    generated_text = api_resp.choices[0].message.content.strip()
-    num_tokens = api_resp.usage.total_tokens
-    stop_reason = (
-        VlmStopReason.LENGTH
-        if api_resp.choices[0].finish_reason == "length"
-        else VlmStopReason.END_OF_SEQUENCE
-    )
+            api_resp = OpenAiApiResponse.model_validate_json(r.text)
+            generated_text = api_resp.choices[0].message.content.strip()
+            num_tokens = api_resp.usage.total_tokens
+            stop_reason = (
+                VlmStopReason.LENGTH
+                if api_resp.choices[0].finish_reason == "length"
+                else VlmStopReason.END_OF_SEQUENCE
+            )

-    return generated_text, num_tokens, stop_reason
+            return generated_text, num_tokens, stop_reason
+        except Exception as e:
+            _log.error(f"Error, could not process request: {e}")
+            return "", 0, VlmStopReason.UNSPECIFIED
+    else:
+        return "", 0, VlmStopReason.UNSPECIFIED


 def api_image_request_streaming(