Mirror of https://github.com/DS4SD/docling.git
docs: Example on how to apply external OCR as post processing (#2517)
* Example on how to apply OCR to a DoclingDocument as post-processing with "nanonets-ocr2-3b" via LM Studio
* Added support for elements with multiple provenances
* Cleaning up
* Improved prompt for nanonets-ocr2-3b
* Cleaning up
* Excluded example from CI
* Updated class name
* Improved usability of the example, added a simple CLI and some helper functions
* Fix api_image_request usage
* Fix pydantic errors
* Improvements and corrections
* Added string sanitation, removing line breaks from remote OCR output while preserving the original text from JSON
* Added quick and reliable detection of empty image crops (elements, table cells, form items); these are not sent to OCR
* Example respects ocr_documents.txt, tuned empty-crop detection
* Cleaning up api_image_request

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
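The example this commit adds drives an OpenAI-compatible chat endpoint (LM Studio serving nanonets-ocr2-3b) and writes the recognized text back into an already-converted DoclingDocument. Below is a minimal sketch of that post-processing loop; the endpoint URL, model name, prompt, and the crop-and-replace logic are illustrative assumptions based on the commit description, not the shipped example code. The shipped example additionally skips empty crops and sanitizes the returned strings, which this sketch omits for brevity.

# Hypothetical sketch: re-OCR TextItem content in a converted DoclingDocument
# using an OpenAI-compatible vision endpoint (e.g. LM Studio with nanonets-ocr2-3b).
# URL, model name, and prompt are assumptions, not the example's actual code.
import base64
from io import BytesIO

import requests
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import TextItem

URL = "http://localhost:1234/v1/chat/completions"  # assumed LM Studio endpoint
PROMPT = "Return the plain text visible in this image."  # illustrative prompt

# Render page images during conversion so element crops can be taken later.
opts = PdfPipelineOptions(generate_page_images=True, images_scale=2.0)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
)
doc = converter.convert("input.pdf").document

for item, _level in doc.iterate_items():
    if not isinstance(item, TextItem) or not item.prov:
        continue
    prov = item.prov[0]
    page = doc.pages[prov.page_no]
    if page.image is None:
        continue
    page_img = page.image.pil_image
    # Provenance bboxes use a bottom-left origin in page coordinates;
    # convert to top-left pixel coordinates of the rendered page image.
    scale = page_img.height / page.size.height
    bbox = prov.bbox.to_top_left_origin(page.size.height)
    crop = page_img.crop(
        (
            round(bbox.l * scale),
            round(bbox.t * scale),
            round(bbox.r * scale),
            round(bbox.b * scale),
        )
    )
    buf = BytesIO()
    crop.save(buf, "PNG")
    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
    payload = {
        "model": "nanonets-ocr2-3b",  # assumed model id in LM Studio
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{b64}"},
                    },
                    {"type": "text", "text": PROMPT},
                ],
            }
        ],
    }
    resp = requests.post(URL, json=payload, timeout=90)
    resp.raise_for_status()
    # Overwrite the original text with the remote OCR result.
    item.text = resp.json()["choices"][0]["message"]["content"].strip()

doc.save_as_markdown("output_ocr.md")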
@@ -23,51 +23,72 @@ def api_image_request(
     **params,
 ) -> Tuple[str, Optional[int], VlmStopReason]:
     img_io = BytesIO()
-    image.save(img_io, "PNG")
-    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                },
-                {
-                    "type": "text",
-                    "text": prompt,
-                },
-            ],
-        }
-    ]
+    image = (
+        image.copy()
+    )  # Fix for inconsistent PIL image width/height to actual byte data
+    image = image.convert("RGBA")
+    good_image = True
+    try:
+        image.save(img_io, "PNG")
+    except Exception as e:
+        good_image = False
+        _log.error(f"Error, corrupter PNG of size: {image.size}: {e}")
 
-    payload = {
-        "messages": messages,
-        **params,
-    }
+    if good_image:
+        try:
+            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
 
-    headers = headers or {}
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{image_base64}"
+                            },
+                        },
+                        {
+                            "type": "text",
+                            "text": prompt,
+                        },
+                    ],
+                }
+            ]
 
-    r = requests.post(
-        str(url),
-        headers=headers,
-        json=payload,
-        timeout=timeout,
-    )
-    if not r.ok:
-        _log.error(f"Error calling the API. Response was {r.text}")
-        r.raise_for_status()
+            payload = {
+                "messages": messages,
+                **params,
+            }
 
-    api_resp = OpenAiApiResponse.model_validate_json(r.text)
-    generated_text = api_resp.choices[0].message.content.strip()
-    num_tokens = api_resp.usage.total_tokens
-    stop_reason = (
-        VlmStopReason.LENGTH
-        if api_resp.choices[0].finish_reason == "length"
-        else VlmStopReason.END_OF_SEQUENCE
-    )
+            headers = headers or {}
 
-    return generated_text, num_tokens, stop_reason
+            r = requests.post(
+                str(url),
+                headers=headers,
+                json=payload,
+                timeout=timeout,
+            )
+            if not r.ok:
+                _log.error(f"Error calling the API. Response was {r.text}")
+                # image.show()
+                # r.raise_for_status()
+
+            api_resp = OpenAiApiResponse.model_validate_json(r.text)
+            generated_text = api_resp.choices[0].message.content.strip()
+            num_tokens = api_resp.usage.total_tokens
+            stop_reason = (
+                VlmStopReason.LENGTH
+                if api_resp.choices[0].finish_reason == "length"
+                else VlmStopReason.END_OF_SEQUENCE
+            )
+
+            return generated_text, num_tokens, stop_reason
+        except Exception as e:
+            _log.error(f"Error, could not process request: {e}")
+            return "", 0, VlmStopReason.UNSPECIFIED
+    else:
+        return "", 0, VlmStopReason.UNSPECIFIED
 
 
 def api_image_request_streaming(