docs: Example on how to apply external OCR as post processing (#2517)

* Example on how to apply OCR to a Docling Document as a post-processing step with "nanonets-ocr2-3b" via LM Studio

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added support of elements with multiple provenances

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* cleaning up

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* improved prompt for nanonets-ocr2-3b

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* cleaning up

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* excluded example from CI

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* updated class name

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Improved usability of the example, added simple cli, and some helper functions

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fix api_image_request usage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix pydantic errors

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Improvements and corrections

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added string sanitization, removing line breaks from remote OCR, also preserving original text from JSON

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added quick and reliable detection of empty image crops (elements, table cells, form items); these are not sent to OCR

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Example respects ocr_documents.txt, tuned empty crop detection

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* cleaning api_image_request

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2025-11-27 11:04:40 +01:00
committed by GitHub
parent 0049857c7d
commit fa21128138
3 changed files with 821 additions and 40 deletions

View File

@@ -23,51 +23,72 @@ def api_image_request(
**params,
) -> Tuple[str, Optional[int], VlmStopReason]:
img_io = BytesIO()
image.save(img_io, "PNG")
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
messages = [
{
"role": "user",
"content": [
image = (
image.copy()
) # Fix for inconsistent PIL image width/height to actual byte data
image = image.convert("RGBA")
good_image = True
try:
image.save(img_io, "PNG")
except Exception as e:
good_image = False
_log.error(f"Error, corrupter PNG of size: {image.size}: {e}")
if good_image:
try:
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
messages = [
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
},
{
"type": "text",
"text": prompt,
},
],
}
]
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{image_base64}"
},
},
{
"type": "text",
"text": prompt,
},
],
}
]
payload = {
"messages": messages,
**params,
}
payload = {
"messages": messages,
**params,
}
headers = headers or {}
headers = headers or {}
r = requests.post(
str(url),
headers=headers,
json=payload,
timeout=timeout,
)
if not r.ok:
_log.error(f"Error calling the API. Response was {r.text}")
r.raise_for_status()
r = requests.post(
str(url),
headers=headers,
json=payload,
timeout=timeout,
)
if not r.ok:
_log.error(f"Error calling the API. Response was {r.text}")
# image.show()
# r.raise_for_status()
api_resp = OpenAiApiResponse.model_validate_json(r.text)
generated_text = api_resp.choices[0].message.content.strip()
num_tokens = api_resp.usage.total_tokens
stop_reason = (
VlmStopReason.LENGTH
if api_resp.choices[0].finish_reason == "length"
else VlmStopReason.END_OF_SEQUENCE
)
api_resp = OpenAiApiResponse.model_validate_json(r.text)
generated_text = api_resp.choices[0].message.content.strip()
num_tokens = api_resp.usage.total_tokens
stop_reason = (
VlmStopReason.LENGTH
if api_resp.choices[0].finish_reason == "length"
else VlmStopReason.END_OF_SEQUENCE
)
return generated_text, num_tokens, stop_reason
return generated_text, num_tokens, stop_reason
except Exception as e:
_log.error(f"Error, could not process request: {e}")
return "", 0, VlmStopReason.UNSPECIFIED
else:
return "", 0, VlmStopReason.UNSPECIFIED
def api_image_request_streaming(