mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
fix(ocr): tesseract support mis-oriented documents
Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
parent
98b5eeb844
commit
7a3ef336fd
@ -6,10 +6,9 @@ import tempfile
|
|||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from subprocess import DEVNULL, PIPE, Popen
|
from subprocess import DEVNULL, PIPE, Popen
|
||||||
from typing import List, Optional, Tuple, Type
|
from typing import List, Optional, Tuple, Type, cast
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
@ -21,7 +20,12 @@ from docling.datamodel.pipeline_options import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.ocr_utils import map_tesseract_script
|
from docling.utils.ocr_utils import (
|
||||||
|
Box,
|
||||||
|
map_tesseract_script,
|
||||||
|
parse_tesseract_orientation,
|
||||||
|
tesseract_box_to_bounding_rectangle,
|
||||||
|
)
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -93,14 +97,13 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
|
|
||||||
return name, version
|
return name, version
|
||||||
|
|
||||||
def _run_tesseract(self, ifilename: str):
|
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
|
||||||
r"""
|
r"""
|
||||||
Run tesseract CLI
|
Run tesseract CLI
|
||||||
"""
|
"""
|
||||||
cmd = [self.options.tesseract_cmd]
|
cmd = [self.options.tesseract_cmd]
|
||||||
|
|
||||||
if "auto" in self.options.lang:
|
if "auto" in self.options.lang:
|
||||||
lang = self._detect_language(ifilename)
|
lang = self._parse_language(osd)
|
||||||
if lang is not None:
|
if lang is not None:
|
||||||
cmd.append("-l")
|
cmd.append("-l")
|
||||||
cmd.append(lang)
|
cmd.append(lang)
|
||||||
@ -139,11 +142,10 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
|
|
||||||
return df_filtered
|
return df_filtered
|
||||||
|
|
||||||
def _detect_language(self, ifilename: str):
|
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
|
||||||
r"""
|
r"""
|
||||||
Run tesseract in PSM 0 mode to detect the language
|
Run tesseract in PSM 0 mode to detect the language
|
||||||
"""
|
"""
|
||||||
assert self._tesseract_languages is not None
|
|
||||||
|
|
||||||
cmd = [self.options.tesseract_cmd]
|
cmd = [self.options.tesseract_cmd]
|
||||||
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
||||||
@ -154,7 +156,11 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
df_detected = pd.read_csv(
|
df_detected = pd.read_csv(
|
||||||
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
||||||
)
|
)
|
||||||
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
|
return df_detected
|
||||||
|
|
||||||
|
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
|
||||||
|
assert self._tesseract_languages is not None
|
||||||
|
scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
|
||||||
if len(scripts) == 0:
|
if len(scripts) == 0:
|
||||||
_log.warning("Tesseract cannot detect the script of the page")
|
_log.warning("Tesseract cannot detect the script of the page")
|
||||||
return None
|
return None
|
||||||
@ -225,8 +231,14 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
) as image_file:
|
) as image_file:
|
||||||
fname = image_file.name
|
fname = image_file.name
|
||||||
high_res_image.save(image_file)
|
high_res_image.save(image_file)
|
||||||
|
df_osd = self._perform_osd(fname)
|
||||||
df_result = self._run_tesseract(fname)
|
doc_orientation = _parse_orientation(df_osd)
|
||||||
|
if doc_orientation != 0:
|
||||||
|
high_res_image = high_res_image.rotate(
|
||||||
|
doc_orientation, expand=True
|
||||||
|
)
|
||||||
|
high_res_image.save(fname)
|
||||||
|
df_result = self._run_tesseract(fname, df_osd)
|
||||||
finally:
|
finally:
|
||||||
if os.path.exists(fname):
|
if os.path.exists(fname):
|
||||||
os.remove(fname)
|
os.remove(fname)
|
||||||
@ -238,13 +250,22 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
text = row["text"]
|
text = row["text"]
|
||||||
conf = row["conf"]
|
conf = row["conf"]
|
||||||
|
|
||||||
l = float(row["left"]) # noqa: E741
|
rotated_bbox = (
|
||||||
b = float(row["top"])
|
row["left"],
|
||||||
w = float(row["width"])
|
row["top"],
|
||||||
h = float(row["height"])
|
row["width"],
|
||||||
|
row["height"],
|
||||||
t = b + h
|
)
|
||||||
r = l + w
|
rotated_bbox = cast(
|
||||||
|
Box, tuple(float(c) for c in rotated_bbox)
|
||||||
|
)
|
||||||
|
rect = tesseract_box_to_bounding_rectangle(
|
||||||
|
rotated_bbox,
|
||||||
|
offset=ocr_rect,
|
||||||
|
scale=self.scale,
|
||||||
|
orientation=doc_orientation,
|
||||||
|
rotated_image_size=high_res_image.size,
|
||||||
|
)
|
||||||
|
|
||||||
cell = TextCell(
|
cell = TextCell(
|
||||||
index=ix,
|
index=ix,
|
||||||
@ -252,17 +273,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
orig=str(text),
|
orig=str(text),
|
||||||
from_ocr=True,
|
from_ocr=True,
|
||||||
confidence=conf / 100.0,
|
confidence=conf / 100.0,
|
||||||
rect=BoundingRectangle.from_bounding_box(
|
rect=rect,
|
||||||
BoundingBox.from_tuple(
|
|
||||||
coord=(
|
|
||||||
(l / self.scale) + ocr_rect.l,
|
|
||||||
(b / self.scale) + ocr_rect.t,
|
|
||||||
(r / self.scale) + ocr_rect.l,
|
|
||||||
(t / self.scale) + ocr_rect.t,
|
|
||||||
),
|
|
||||||
origin=CoordOrigin.TOPLEFT,
|
|
||||||
)
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
all_ocr_cells.append(cell)
|
all_ocr_cells.append(cell)
|
||||||
|
|
||||||
@ -278,3 +289,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def get_options_type(cls) -> Type[OcrOptions]:
|
def get_options_type(cls) -> Type[OcrOptions]:
|
||||||
return TesseractCliOcrOptions
|
return TesseractCliOcrOptions
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_orientation(df_osd: pd.DataFrame) -> int:
|
||||||
|
orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
|
||||||
|
orientation = parse_tesseract_orientation(orientations[0].strip())
|
||||||
|
return orientation
|
||||||
|
@ -3,10 +3,9 @@ from __future__ import annotations
|
|||||||
import logging
|
import logging
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Type
|
from typing import Dict, Iterable, Optional, Type
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc.page import TextCell
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.ocr_utils import map_tesseract_script
|
from docling.utils.ocr_utils import (
|
||||||
|
map_tesseract_script,
|
||||||
|
parse_tesseract_orientation,
|
||||||
|
tesseract_box_to_bounding_rectangle,
|
||||||
|
)
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
if lang == "auto":
|
if lang == "auto":
|
||||||
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
|
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
|
||||||
self.osd_reader = tesserocr.PyTessBaseAPI(
|
|
||||||
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
self.reader = tesserocr.PyTessBaseAPI(
|
self.reader = tesserocr.PyTessBaseAPI(
|
||||||
**{"lang": lang} | tesserocr_kwargs,
|
**{"lang": lang} | tesserocr_kwargs,
|
||||||
)
|
)
|
||||||
|
self.osd_reader = tesserocr.PyTessBaseAPI(
|
||||||
|
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
||||||
|
)
|
||||||
self.reader_RIL = tesserocr.RIL
|
self.reader_RIL = tesserocr.RIL
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
@ -125,6 +128,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
else:
|
else:
|
||||||
with TimeRecorder(conv_res, "ocr"):
|
with TimeRecorder(conv_res, "ocr"):
|
||||||
assert self.reader is not None
|
assert self.reader is not None
|
||||||
|
assert self.osd_reader is not None
|
||||||
assert self._tesserocr_languages is not None
|
assert self._tesserocr_languages is not None
|
||||||
|
|
||||||
ocr_rects = self.get_ocr_rects(page)
|
ocr_rects = self.get_ocr_rects(page)
|
||||||
@ -139,16 +143,17 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
local_reader = self.reader
|
local_reader = self.reader
|
||||||
if "auto" in self.options.lang:
|
|
||||||
assert self.osd_reader is not None
|
|
||||||
|
|
||||||
self.osd_reader.SetImage(high_res_image)
|
self.osd_reader.SetImage(high_res_image)
|
||||||
osd = self.osd_reader.DetectOrientationScript()
|
osd = self.osd_reader.DetectOrientationScript()
|
||||||
|
|
||||||
# No text, probably
|
# No text, probably
|
||||||
if osd is None:
|
if osd is None:
|
||||||
continue
|
continue
|
||||||
|
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
|
||||||
|
if doc_orientation != 0:
|
||||||
|
high_res_image = high_res_image.rotate(
|
||||||
|
doc_orientation, expand=True
|
||||||
|
)
|
||||||
|
if "auto" in self.options.lang:
|
||||||
script = osd["script_name"]
|
script = osd["script_name"]
|
||||||
script = map_tesseract_script(script)
|
script = map_tesseract_script(script)
|
||||||
lang = f"{self.script_prefix}{script}"
|
lang = f"{self.script_prefix}{script}"
|
||||||
@ -188,11 +193,14 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
# Extract text within the bounding box
|
# Extract text within the bounding box
|
||||||
text = local_reader.GetUTF8Text().strip()
|
text = local_reader.GetUTF8Text().strip()
|
||||||
confidence = local_reader.MeanTextConf()
|
confidence = local_reader.MeanTextConf()
|
||||||
left = box["x"] / self.scale
|
rotated_bbox = (box["x"], box["y"], box["w"], box["h"])
|
||||||
bottom = box["y"] / self.scale
|
rect = tesseract_box_to_bounding_rectangle(
|
||||||
right = (box["x"] + box["w"]) / self.scale
|
rotated_bbox,
|
||||||
top = (box["y"] + box["h"]) / self.scale
|
offset=ocr_rect,
|
||||||
|
scale=self.scale,
|
||||||
|
orientation=doc_orientation,
|
||||||
|
rotated_image_size=high_res_image.size,
|
||||||
|
)
|
||||||
cells.append(
|
cells.append(
|
||||||
TextCell(
|
TextCell(
|
||||||
index=ix,
|
index=ix,
|
||||||
@ -200,12 +208,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
orig=text,
|
orig=text,
|
||||||
from_ocr=True,
|
from_ocr=True,
|
||||||
confidence=confidence,
|
confidence=confidence,
|
||||||
rect=BoundingRectangle.from_bounding_box(
|
rect=rect,
|
||||||
BoundingBox.from_tuple(
|
|
||||||
coord=(left, top, right, bottom),
|
|
||||||
origin=CoordOrigin.TOPLEFT,
|
|
||||||
),
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1,3 +1,15 @@
|
|||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle
|
||||||
|
|
||||||
|
_TESSERACT_ORIENTATIONS = {0, 90, 180, 270}
|
||||||
|
|
||||||
|
Point = Tuple[float, float]
|
||||||
|
Box = Tuple[float, float, float, float]
|
||||||
|
Size = Tuple[int, int]
|
||||||
|
|
||||||
|
|
||||||
def map_tesseract_script(script: str) -> str:
|
def map_tesseract_script(script: str) -> str:
|
||||||
r""" """
|
r""" """
|
||||||
if script == "Katakana" or script == "Hiragana":
|
if script == "Katakana" or script == "Hiragana":
|
||||||
@ -7,3 +19,75 @@ def map_tesseract_script(script: str) -> str:
|
|||||||
elif script == "Korean":
|
elif script == "Korean":
|
||||||
script = "Hangul"
|
script = "Hangul"
|
||||||
return script
|
return script
|
||||||
|
|
||||||
|
|
||||||
|
def reverse_tesseract_preprocessing_rotation(
    box: Box, orientation: int, rotated_im_size: Size
) -> tuple[Point, Point, Point, Point]:
    """Express a box detected on the rotated image in unrotated coordinates.

    Args:
        box: ``(left, top, width, height)`` of the box on the rotated image.
        orientation: rotation, in degrees, that was applied before OCR; one
            of 0, 90, 180 or 270.
        rotated_im_size: ``(width, height)`` of the rotated image.

    Returns:
        Four ``(x, y)`` corner points. For orientation 0 these are the
        top-left, top-right, bottom-right and bottom-left corners of ``box``;
        for the other orientations they are those same four corners mapped
        back through the rotation (assuming the counter-clockwise, expanding
        rotation the OCR models apply).

    Raises:
        ValueError: if ``orientation`` is none of the supported values.
    """
    left, top, width, height = box
    im_width, im_height = rotated_im_size

    if orientation == 0:
        # Nothing to undo: just expand the box into its four corners.
        return (
            (left, top),
            (left + width, top),
            (left + width, top + height),
            (left, top + height),
        )

    if orientation == 90:
        anchor_x = im_height - top
        anchor_y = left
        return (
            (anchor_x, anchor_y),
            (anchor_x, anchor_y + width),
            (anchor_x - height, anchor_y + width),
            (anchor_x - height, anchor_y),
        )

    if orientation == 180:
        anchor_x = im_width - left
        anchor_y = im_height - top
        return (
            (anchor_x, anchor_y),
            (anchor_x - width, anchor_y),
            (anchor_x - width, anchor_y - height),
            (anchor_x, anchor_y - height),
        )

    if orientation == 270:
        anchor_x = top
        anchor_y = im_width - left
        return (
            (anchor_x, anchor_y),
            (anchor_x, anchor_y - width),
            (anchor_x + height, anchor_y - width),
            (anchor_x + height, anchor_y),
        )

    raise ValueError(
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(_TESSERACT_ORIENTATIONS)}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_tesseract_orientation(orientation: str) -> int:
    """Turn an orientation reported by tesseract into a validated integer.

    Args:
        orientation: the reported orientation, convertible with ``int()``.

    Returns:
        The orientation in degrees.

    Raises:
        ValueError: if the value cannot be converted to an integer, or the
            integer is not a member of ``_TESSERACT_ORIENTATIONS``.
    """
    degrees = int(orientation)
    if degrees in _TESSERACT_ORIENTATIONS:
        return degrees
    raise ValueError(
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(_TESSERACT_ORIENTATIONS)}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def tesseract_box_to_bounding_rectangle(
    box: Box,
    *,
    offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    rotated_image_size: Size,
) -> BoundingRectangle:
    """Build the ``BoundingRectangle`` of a box detected by tesseract.

    Args:
        box: ``(left, top, width, height)`` of the detection on the rotated
            image that was handed to tesseract, with a top-left origin.
        offset: optional box whose ``l`` and ``t`` are added to every x and y
            coordinate after scaling.
        scale: factor every corner coordinate is divided by.
        orientation: rotation, in degrees, applied to the image before OCR.
        rotated_image_size: ``(width, height)`` of the rotated image.

    Returns:
        A rectangle with a top-left coordinate origin whose four corners are
        the detection's corners with the rotation undone, divided by
        ``scale`` and shifted by ``offset`` when one is given.

    Raises:
        ValueError: propagated from the rotation helper when ``orientation``
            is not supported.
    """
    # box is (left, top, width, height) with a top-left origin; the helper
    # undoes the pre-OCR rotation and returns the four corner points.
    r_0, r_1, r_2, r_3 = reverse_tesseract_preprocessing_rotation(
        box, orientation, rotated_image_size
    )
    rect = BoundingRectangle(
        r_x0=r_0[0] / scale,
        r_y0=r_0[1] / scale,
        r_x1=r_1[0] / scale,
        r_y1=r_1[1] / scale,
        r_x2=r_2[0] / scale,
        r_y2=r_2[1] / scale,
        r_x3=r_3[0] / scale,
        r_y3=r_3[1] / scale,
        coord_origin=CoordOrigin.TOPLEFT,
    )
    if offset is not None:
        # Shift all four corners by the offset's left/top.
        rect.r_x0 += offset.l
        rect.r_x1 += offset.l
        rect.r_x2 += offset.l
        rect.r_x3 += offset.l
        rect.r_y0 += offset.t
        rect.r_y1 += offset.t
        rect.r_y2 += offset.t
        rect.r_y3 += offset.t
    return rect
|
||||||
|
@ -0,0 +1,3 @@
|
|||||||
|
<document>
|
||||||
|
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
|
||||||
|
</document>
|
@ -0,0 +1 @@
|
|||||||
|
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -0,0 +1 @@
|
|||||||
|
package
|
@ -0,0 +1 @@
|
|||||||
|
[{"page_no": 0, "size": {"width": 841.9216918945312, "height": 595.201171875}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null, "vlm_response": null}, "assembled": {"elements": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 
89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}, {"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "body": [{"label": "text", "id": 1, "page_no": 0, "cluster": {"id": 1, "label": "text", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}, "confidence": 0.5234212875366211, "cells": [{"id": 2, "text": "package", "bbox": {"l": 131.21306574279092, "t": 441.0071698212682, "r": 152.19606490864376, "b": 521.0762158417759, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "package"}], "headers": [{"label": "page_header", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "page_header", "bbox": {"l": 77.10171546422428, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}, "confidence": 0.6016772389411926, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 77.10171546422428, "t": 89.23887398109309, "r": 
96.6831586150625, "b": 520.7638577050515, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 100.55299576256091, "t": 89.12381765643227, "r": 124.91101654503161, "b": 523.3155494272656, "coord_origin": "TOPLEFT"}}], "children": []}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained"}]}}]
|
@ -0,0 +1,4 @@
|
|||||||
|
<document>
|
||||||
|
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
|
||||||
|
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
|
||||||
|
</document>
|
@ -0,0 +1 @@
|
|||||||
|
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 523.0, 123.25502522786462], "page": 1, "span": [0, 86], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -0,0 +1,3 @@
|
|||||||
|
package
|
||||||
|
|
||||||
|
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,3 @@
|
|||||||
|
<document>
|
||||||
|
<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
|
||||||
|
</document>
|
@ -0,0 +1 @@
|
|||||||
|
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_270.pdf", "filename-prov": null, "document-hash": "52f54e7183bdb73aa3713c7b169baca93e276963a138418c26e7d6a1ea128f14", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "59bc9ddba89e7b008185dd16d384493beb034686e5670546786390c5d237a304", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [691.6666666666666, 444.53450520833337, 710.3333333333334, 521.5345052083334], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -0,0 +1 @@
|
|||||||
|
package
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,3 @@
|
|||||||
|
<document>
|
||||||
|
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph>
|
||||||
|
</document>
|
@ -0,0 +1 @@
|
|||||||
|
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_90.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.66666666666666, 73.53450520833337, 150.33333333333334, 150.53450520833331], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -0,0 +1 @@
|
|||||||
|
package
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,3 @@
|
|||||||
|
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
|
||||||
|
<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
|
||||||
|
</doctag>
|
@ -0,0 +1 @@
|
|||||||
|
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
@ -0,0 +1,3 @@
|
|||||||
|
package
|
||||||
|
|
||||||
|
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,3 @@
|
|||||||
|
<doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||||
|
<text><loc_411><loc_62><loc_422><loc_127>package</text>
|
||||||
|
</doctag>
|
@ -0,0 +1 @@
|
|||||||
|
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_270", "origin": {"mimetype": "application/pdf", "binary_hash": 10890858393843077593, "filename": "ocr_test_rotated_270.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [{"page_no": 1, "bbox": {"l": 718.6666666666666, "t": 522.8678385416666, "r": 764.0, "b": 91.86783854166669, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 691.6666666666666, "t": 521.5345052083334, "r": 710.3333333333334, "b": 444.53450520833337, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}}
|
@ -0,0 +1 @@
|
|||||||
|
package
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,3 @@
|
|||||||
|
<doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
|
||||||
|
<text><loc_78><loc_374><loc_89><loc_438>package</text>
|
||||||
|
</doctag>
|
@ -0,0 +1 @@
|
|||||||
|
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_90", "origin": {"mimetype": "application/pdf", "binary_hash": 6989291015361162334, "filename": "ocr_test_rotated_90.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [{"page_no": 1, "bbox": {"l": 78.0, "t": 503.201171875, "r": 123.33333333333333, "b": 72.201171875, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 131.66666666666666, "t": 150.53450520833331, "r": 150.33333333333334, "b": 73.53450520833337, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 841.9216918945312, "height": 595.201171875}, "image": null, "page_no": 1}}}
|
@ -0,0 +1 @@
|
|||||||
|
package
|
File diff suppressed because one or more lines are too long
BIN
tests/data_scanned/ocr_test_rotated_180.pdf
Normal file
BIN
tests/data_scanned/ocr_test_rotated_180.pdf
Normal file
Binary file not shown.
BIN
tests/data_scanned/ocr_test_rotated_270.pdf
Normal file
BIN
tests/data_scanned/ocr_test_rotated_270.pdf
Normal file
Binary file not shown.
BIN
tests/data_scanned/ocr_test_rotated_90.pdf
Normal file
BIN
tests/data_scanned/ocr_test_rotated_90.pdf
Normal file
Binary file not shown.
@ -1,6 +1,6 @@
|
|||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List, Tuple
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -56,33 +56,35 @@ def get_converter(ocr_options: OcrOptions):
|
|||||||
def test_e2e_conversions():
|
def test_e2e_conversions():
|
||||||
pdf_paths = get_pdf_paths()
|
pdf_paths = get_pdf_paths()
|
||||||
|
|
||||||
engines: List[OcrOptions] = [
|
engines: List[Tuple[OcrOptions, bool]] = [
|
||||||
EasyOcrOptions(),
|
(EasyOcrOptions(), False),
|
||||||
TesseractOcrOptions(),
|
(TesseractOcrOptions(), True),
|
||||||
TesseractCliOcrOptions(),
|
(TesseractCliOcrOptions(), True),
|
||||||
EasyOcrOptions(force_full_page_ocr=True),
|
(EasyOcrOptions(force_full_page_ocr=True), False),
|
||||||
TesseractOcrOptions(force_full_page_ocr=True),
|
(TesseractOcrOptions(force_full_page_ocr=True), True),
|
||||||
TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
|
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
|
||||||
TesseractCliOcrOptions(force_full_page_ocr=True),
|
(TesseractCliOcrOptions(force_full_page_ocr=True), True),
|
||||||
TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
|
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
|
||||||
]
|
]
|
||||||
|
|
||||||
# rapidocr is only available for Python >=3.6,<3.13
|
# rapidocr is only available for Python >=3.6,<3.13
|
||||||
if sys.version_info < (3, 13):
|
if sys.version_info < (3, 13):
|
||||||
engines.append(RapidOcrOptions())
|
engines.append((RapidOcrOptions(), False))
|
||||||
engines.append(RapidOcrOptions(force_full_page_ocr=True))
|
engines.append((RapidOcrOptions(force_full_page_ocr=True), False))
|
||||||
|
|
||||||
# only works on mac
|
# only works on mac
|
||||||
if "darwin" == sys.platform:
|
if "darwin" == sys.platform:
|
||||||
engines.append(OcrMacOptions())
|
engines.append((OcrMacOptions(), True))
|
||||||
engines.append(OcrMacOptions(force_full_page_ocr=True))
|
engines.append((OcrMacOptions(force_full_page_ocr=True), True))
|
||||||
|
|
||||||
for ocr_options in engines:
|
for ocr_options, supports_rotation in engines:
|
||||||
print(
|
print(
|
||||||
f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
|
f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
|
||||||
)
|
)
|
||||||
converter = get_converter(ocr_options=ocr_options)
|
converter = get_converter(ocr_options=ocr_options)
|
||||||
for pdf_path in pdf_paths:
|
for pdf_path in pdf_paths:
|
||||||
|
if not supports_rotation and "rotated" in pdf_path.name:
|
||||||
|
continue
|
||||||
print(f"converting {pdf_path}")
|
print(f"converting {pdf_path}")
|
||||||
|
|
||||||
doc_result: ConversionResult = converter.convert(pdf_path)
|
doc_result: ConversionResult = converter.convert(pdf_path)
|
||||||
|
Loading…
Reference in New Issue
Block a user