fix(ocr): tesseract support mis-oriented documents

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro 2025-03-14 14:31:24 +01:00
parent 98b5eeb844
commit 7a3ef336fd
35 changed files with 224 additions and 71 deletions

View File

@ -6,10 +6,9 @@ import tempfile
from collections.abc import Iterable
from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen
from typing import List, Optional, Tuple, Type
from typing import List, Optional, Tuple, Type, cast
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling.datamodel.base_models import Page
@ -21,7 +20,12 @@ from docling.datamodel.pipeline_options import (
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.ocr_utils import (
Box,
map_tesseract_script,
parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle,
)
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@ -93,14 +97,13 @@ class TesseractOcrCliModel(BaseOcrModel):
return name, version
def _run_tesseract(self, ifilename: str):
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
r"""
Run tesseract CLI
"""
cmd = [self.options.tesseract_cmd]
if "auto" in self.options.lang:
lang = self._detect_language(ifilename)
lang = self._parse_language(osd)
if lang is not None:
cmd.append("-l")
cmd.append(lang)
@ -139,11 +142,10 @@ class TesseractOcrCliModel(BaseOcrModel):
return df_filtered
def _detect_language(self, ifilename: str):
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
r"""
Run tesseract in PSM 0 mode to detect the language
"""
assert self._tesseract_languages is not None
cmd = [self.options.tesseract_cmd]
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
@ -154,7 +156,11 @@ class TesseractOcrCliModel(BaseOcrModel):
df_detected = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
)
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
return df_detected
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
assert self._tesseract_languages is not None
scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
if len(scripts) == 0:
_log.warning("Tesseract cannot detect the script of the page")
return None
@ -225,8 +231,14 @@ class TesseractOcrCliModel(BaseOcrModel):
) as image_file:
fname = image_file.name
high_res_image.save(image_file)
df_result = self._run_tesseract(fname)
df_osd = self._perform_osd(fname)
doc_orientation = _parse_orientation(df_osd)
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
doc_orientation, expand=True
)
high_res_image.save(fname)
df_result = self._run_tesseract(fname, df_osd)
finally:
if os.path.exists(fname):
os.remove(fname)
@ -238,13 +250,22 @@ class TesseractOcrCliModel(BaseOcrModel):
text = row["text"]
conf = row["conf"]
l = float(row["left"]) # noqa: E741
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
t = b + h
r = l + w
rotated_bbox = (
row["left"],
row["top"],
row["width"],
row["height"],
)
rotated_bbox = cast(
Box, tuple(float(c) for c in rotated_bbox)
)
rect = tesseract_box_to_bounding_rectangle(
rotated_bbox,
offset=ocr_rect,
scale=self.scale,
orientation=doc_orientation,
rotated_image_size=high_res_image.size,
)
cell = TextCell(
index=ix,
@ -252,17 +273,7 @@ class TesseractOcrCliModel(BaseOcrModel):
orig=str(text),
from_ocr=True,
confidence=conf / 100.0,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
)
),
rect=rect,
)
all_ocr_cells.append(cell)
@ -278,3 +289,9 @@ class TesseractOcrCliModel(BaseOcrModel):
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
return TesseractCliOcrOptions
def _parse_orientation(df_osd: pd.DataFrame) -> int:
orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
orientation = parse_tesseract_orientation(orientations[0].strip())
return orientation

View File

@ -3,10 +3,9 @@ from __future__ import annotations
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Optional, Type
from typing import Dict, Iterable, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling_core.types.doc.page import TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.ocr_utils import (
map_tesseract_script,
parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle,
)
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
if lang == "auto":
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
self.osd_reader = tesserocr.PyTessBaseAPI(
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
)
else:
self.reader = tesserocr.PyTessBaseAPI(
**{"lang": lang} | tesserocr_kwargs,
)
self.osd_reader = tesserocr.PyTessBaseAPI(
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
)
self.reader_RIL = tesserocr.RIL
def __del__(self):
@ -125,6 +128,7 @@ class TesseractOcrModel(BaseOcrModel):
else:
with TimeRecorder(conv_res, "ocr"):
assert self.reader is not None
assert self.osd_reader is not None
assert self._tesserocr_languages is not None
ocr_rects = self.get_ocr_rects(page)
@ -139,16 +143,17 @@ class TesseractOcrModel(BaseOcrModel):
)
local_reader = self.reader
if "auto" in self.options.lang:
assert self.osd_reader is not None
self.osd_reader.SetImage(high_res_image)
osd = self.osd_reader.DetectOrientationScript()
# No text, probably
if osd is None:
continue
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
doc_orientation, expand=True
)
if "auto" in self.options.lang:
script = osd["script_name"]
script = map_tesseract_script(script)
lang = f"{self.script_prefix}{script}"
@ -188,11 +193,14 @@ class TesseractOcrModel(BaseOcrModel):
# Extract text within the bounding box
text = local_reader.GetUTF8Text().strip()
confidence = local_reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
top = (box["y"] + box["h"]) / self.scale
rotated_bbox = (box["x"], box["y"], box["w"], box["h"])
rect = tesseract_box_to_bounding_rectangle(
rotated_bbox,
offset=ocr_rect,
scale=self.scale,
orientation=doc_orientation,
rotated_image_size=high_res_image.size,
)
cells.append(
TextCell(
index=ix,
@ -200,12 +208,7 @@ class TesseractOcrModel(BaseOcrModel):
orig=text,
from_ocr=True,
confidence=confidence,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
),
),
rect=rect,
)
)

View File

@ -1,3 +1,15 @@
from typing import Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle
_TESSERACT_ORIENTATIONS = {0, 90, 180, 270}
Point = Tuple[float, float]
Box = Tuple[float, float, float, float]
Size = Tuple[int, int]
def map_tesseract_script(script: str) -> str:
r""" """
if script == "Katakana" or script == "Hiragana":
@ -7,3 +19,75 @@ def map_tesseract_script(script: str) -> str:
elif script == "Korean":
script = "Hangul"
return script
def reverse_tesseract_preprocessing_rotation(
box: Box, orientation: int, rotated_im_size: Size
) -> tuple[Point, Point, Point, Point]:
l, t, w, h = box
rotated_w, rotated_h = rotated_im_size
if orientation == 0:
return (l, t), (l + w, t), (l + w, t + h), (l, t + h)
if orientation == 90:
x0 = rotated_h - t
y0 = l
return (x0, y0), (x0, y0 + w), (x0 - h, y0 + w), (x0 - h, y0)
if orientation == 180:
x0 = rotated_w - l
y0 = rotated_h - t
return (x0, y0), (x0 - w, y0), (x0 - w, y0 - h), (x0, y0 - h)
if orientation == 270:
x0 = t
y0 = rotated_w - l
return (x0, y0), (x0, y0 - w), (x0 + h, y0 - w), (x0 + h, y0)
msg = (
f"invalid tesseract document orientation {orientation}, "
f"expected orientation: {sorted(_TESSERACT_ORIENTATIONS)}"
)
raise ValueError(msg)
def parse_tesseract_orientation(orientation: str) -> int:
parsed = int(orientation)
if parsed not in _TESSERACT_ORIENTATIONS:
msg = (
f"invalid tesseract document orientation {orientation}, "
f"expected orientation: {sorted(_TESSERACT_ORIENTATIONS)}"
)
raise ValueError(msg)
return parsed
def tesseract_box_to_bounding_rectangle(
box: Box,
*,
offset: Optional[BoundingBox] = None,
scale: float,
orientation: int,
rotated_image_size: Size,
) -> BoundingRectangle:
# box is in the top, left, height, width format + top left orientation
r_0, r_1, r_2, r_3 = reverse_tesseract_preprocessing_rotation(
box, orientation, rotated_image_size
)
rect = BoundingRectangle(
r_x0=r_0[0] / scale,
r_y0=r_0[1] / scale,
r_x1=r_1[0] / scale,
r_y1=r_1[1] / scale,
r_x2=r_2[0] / scale,
r_y2=r_2[1] / scale,
r_x3=r_3[0] / scale,
r_y3=r_3[1] / scale,
coord_origin=CoordOrigin.TOPLEFT,
)
if offset is not None:
rect.r_x0 += offset.l
rect.r_x1 += offset.l
rect.r_x2 += offset.l
rect.r_x3 += offset.l
rect.r_y0 += offset.t
rect.r_y1 += offset.t
rect.r_y2 += offset.t
rect.r_y3 += offset.t
return rect

View File

@ -0,0 +1,3 @@
<document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
</document>

View File

@ -0,0 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -0,0 +1 @@
package

View File

@ -0,0 +1 @@
[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]

View File

@ -0,0 +1,4 @@
<document>
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
</document>

View File

@ -0,0 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 523.0, 123.25502522786462], "page": 1, "span": [0, 86], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -0,0 +1,3 @@
package
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,3 @@
<document>
<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
</document>

View File

@ -0,0 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_270.pdf", "filename-prov": null, "document-hash": "52f54e7183bdb73aa3713c7b169baca93e276963a138418c26e7d6a1ea128f14", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "59bc9ddba89e7b008185dd16d384493beb034686e5670546786390c5d237a304", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [691.6666666666666, 444.53450520833337, 710.3333333333334, 521.5345052083334], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -0,0 +1 @@
package

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,3 @@
<document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph>
</document>

View File

@ -0,0 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_90.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.66666666666666, 73.53450520833337, 150.33333333333334, 150.53450520833331], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -0,0 +1 @@
package

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,3 @@
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
</doctag>

View File

@ -0,0 +1 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}

View File

@ -0,0 +1,3 @@
package
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,3 @@
<doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_411><loc_62><loc_422><loc_127>package</text>
</doctag>

View File

@ -0,0 +1 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_270", "origin": {"mimetype": "application/pdf", "binary_hash": 10890858393843077593, "filename": "ocr_test_rotated_270.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [{"page_no": 1, "bbox": {"l": 718.6666666666666, "t": 522.8678385416666, "r": 764.0, "b": 91.86783854166669, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 691.6666666666666, "t": 521.5345052083334, "r": 710.3333333333334, "b": 444.53450520833337, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}}

View File

@ -0,0 +1 @@
package

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,3 @@
<doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_78><loc_374><loc_89><loc_438>package</text>
</doctag>

View File

@ -0,0 +1 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_90", "origin": {"mimetype": "application/pdf", "binary_hash": 6989291015361162334, "filename": "ocr_test_rotated_90.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [{"page_no": 1, "bbox": {"l": 78.0, "t": 503.201171875, "r": 123.33333333333333, "b": 72.201171875, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 131.66666666666666, "t": 150.53450520833331, "r": 150.33333333333334, "b": 73.53450520833337, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}}

View File

@ -0,0 +1 @@
package

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,6 +1,6 @@
import sys
from pathlib import Path
from typing import List
from typing import List, Tuple
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
@ -56,33 +56,35 @@ def get_converter(ocr_options: OcrOptions):
def test_e2e_conversions():
pdf_paths = get_pdf_paths()
engines: List[OcrOptions] = [
EasyOcrOptions(),
TesseractOcrOptions(),
TesseractCliOcrOptions(),
EasyOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
TesseractCliOcrOptions(force_full_page_ocr=True),
TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
engines: List[Tuple[OcrOptions, bool]] = [
(EasyOcrOptions(), False),
(TesseractOcrOptions(), True),
(TesseractCliOcrOptions(), True),
(EasyOcrOptions(force_full_page_ocr=True), False),
(TesseractOcrOptions(force_full_page_ocr=True), True),
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(TesseractCliOcrOptions(force_full_page_ocr=True), True),
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
]
# rapidocr is only available for Python >=3.6,<3.13
if sys.version_info < (3, 13):
engines.append(RapidOcrOptions())
engines.append(RapidOcrOptions(force_full_page_ocr=True))
engines.append((RapidOcrOptions(), False))
engines.append((RapidOcrOptions(force_full_page_ocr=True), False))
# only works on mac
if "darwin" == sys.platform:
engines.append(OcrMacOptions())
engines.append(OcrMacOptions(force_full_page_ocr=True))
engines.append((OcrMacOptions(), True))
engines.append((OcrMacOptions(force_full_page_ocr=True), True))
for ocr_options in engines:
for ocr_options, supports_rotation in engines:
print(
f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
)
converter = get_converter(ocr_options=ocr_options)
for pdf_path in pdf_paths:
if not supports_rotation and "rotated" in pdf_path.name:
continue
print(f"converting {pdf_path}")
doc_result: ConversionResult = converter.convert(pdf_path)