mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
fix(ocr): refactor rotation utilities
Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
parent
0b39bb58bf
commit
fdc6a01bc8
@ -6,10 +6,11 @@ import tempfile
|
|||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from subprocess import DEVNULL, PIPE, Popen
|
from subprocess import DEVNULL, PIPE, Popen
|
||||||
from typing import List, Optional, Tuple, Type, cast
|
from typing import List, Optional, Tuple, Type
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
@ -25,7 +26,6 @@ from docling.utils.ocr_utils import (
|
|||||||
parse_tesseract_orientation,
|
parse_tesseract_orientation,
|
||||||
tesseract_box_to_bounding_rectangle,
|
tesseract_box_to_bounding_rectangle,
|
||||||
)
|
)
|
||||||
from docling.utils.orientation import Box
|
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -235,7 +235,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
doc_orientation = _parse_orientation(df_osd)
|
doc_orientation = _parse_orientation(df_osd)
|
||||||
if doc_orientation != 0:
|
if doc_orientation != 0:
|
||||||
high_res_image = high_res_image.rotate(
|
high_res_image = high_res_image.rotate(
|
||||||
doc_orientation, expand=True
|
-doc_orientation, expand=True
|
||||||
)
|
)
|
||||||
high_res_image.save(fname)
|
high_res_image.save(fname)
|
||||||
df_result = self._run_tesseract(fname, df_osd)
|
df_result = self._run_tesseract(fname, df_osd)
|
||||||
@ -250,21 +250,18 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
text = row["text"]
|
text = row["text"]
|
||||||
conf = row["conf"]
|
conf = row["conf"]
|
||||||
|
|
||||||
rotated_bbox = (
|
l, t = float(row["left"]), float(row["top"])
|
||||||
row["left"],
|
r = l + float(row["width"])
|
||||||
row["top"],
|
b = t + row["height"]
|
||||||
row["width"],
|
bbox = BoundingBox(
|
||||||
row["height"],
|
l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT
|
||||||
)
|
|
||||||
rotated_bbox = cast(
|
|
||||||
Box, tuple(float(c) for c in rotated_bbox)
|
|
||||||
)
|
)
|
||||||
rect = tesseract_box_to_bounding_rectangle(
|
rect = tesseract_box_to_bounding_rectangle(
|
||||||
rotated_bbox,
|
bbox,
|
||||||
offset=ocr_rect,
|
original_offset=ocr_rect,
|
||||||
scale=self.scale,
|
scale=self.scale,
|
||||||
orientation=doc_orientation,
|
orientation=doc_orientation,
|
||||||
rotated_image_size=high_res_image.size,
|
im_size=high_res_image.size,
|
||||||
)
|
)
|
||||||
cell = TextCell(
|
cell = TextCell(
|
||||||
index=ix,
|
index=ix,
|
||||||
|
@ -5,6 +5,7 @@ from collections.abc import Iterable
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Iterable, Optional, Type
|
from typing import Dict, Iterable, Optional, Type
|
||||||
|
|
||||||
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import TextCell
|
from docling_core.types.doc.page import TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
@ -151,7 +152,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
|
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
|
||||||
if doc_orientation != 0:
|
if doc_orientation != 0:
|
||||||
high_res_image = high_res_image.rotate(
|
high_res_image = high_res_image.rotate(
|
||||||
doc_orientation, expand=True
|
-doc_orientation, expand=True
|
||||||
)
|
)
|
||||||
if "auto" in self.options.lang:
|
if "auto" in self.options.lang:
|
||||||
script = osd["script_name"]
|
script = osd["script_name"]
|
||||||
@ -193,13 +194,18 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
# Extract text within the bounding box
|
# Extract text within the bounding box
|
||||||
text = local_reader.GetUTF8Text().strip()
|
text = local_reader.GetUTF8Text().strip()
|
||||||
confidence = local_reader.MeanTextConf()
|
confidence = local_reader.MeanTextConf()
|
||||||
rotated_bbox = (box["x"], box["y"], box["w"], box["h"])
|
l, t = box["x"], box["y"]
|
||||||
|
r = l + box["w"]
|
||||||
|
b = t + box["h"]
|
||||||
|
bbox = BoundingBox(
|
||||||
|
l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT
|
||||||
|
)
|
||||||
rect = tesseract_box_to_bounding_rectangle(
|
rect = tesseract_box_to_bounding_rectangle(
|
||||||
rotated_bbox,
|
bbox,
|
||||||
offset=ocr_rect,
|
original_offset=ocr_rect,
|
||||||
scale=self.scale,
|
scale=self.scale,
|
||||||
orientation=doc_orientation,
|
orientation=doc_orientation,
|
||||||
rotated_image_size=high_res_image.size,
|
im_size=high_res_image.size,
|
||||||
)
|
)
|
||||||
cells.append(
|
cells.append(
|
||||||
TextCell(
|
TextCell(
|
||||||
|
@ -1,14 +1,9 @@
|
|||||||
from typing import Optional
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle
|
from docling_core.types.doc.page import BoundingRectangle
|
||||||
|
|
||||||
from docling.utils.orientation import (
|
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
|
||||||
Box,
|
|
||||||
Size,
|
|
||||||
CLIPPED_ORIENTATIONS,
|
|
||||||
rotate_ltwh_bounding_box,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def map_tesseract_script(script: str) -> str:
|
def map_tesseract_script(script: str) -> str:
|
||||||
@ -38,33 +33,37 @@ def parse_tesseract_orientation(orientation: str) -> int:
|
|||||||
|
|
||||||
|
|
||||||
def tesseract_box_to_bounding_rectangle(
|
def tesseract_box_to_bounding_rectangle(
|
||||||
box: Box,
|
bbox: BoundingBox,
|
||||||
*,
|
*,
|
||||||
offset: Optional[BoundingBox] = None,
|
original_offset: Optional[BoundingBox] = None,
|
||||||
scale: float,
|
scale: float,
|
||||||
orientation: int,
|
orientation: int,
|
||||||
rotated_image_size: Size,
|
im_size: Tuple[int, int],
|
||||||
) -> BoundingRectangle:
|
) -> BoundingRectangle:
|
||||||
# box is in the top, left, height, width format + top left orientation
|
# box is in the top, left, height, width format, top left coordinates
|
||||||
r_0, r_1, r_2, r_3 = rotate_ltwh_bounding_box(box, orientation, rotated_image_size)
|
rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
|
||||||
rect = BoundingRectangle(
|
rect = BoundingRectangle(
|
||||||
r_x0=r_0[0] / scale,
|
r_x0=rect.r_x0 / scale,
|
||||||
r_y0=r_0[1] / scale,
|
r_y0=rect.r_y0 / scale,
|
||||||
r_x1=r_1[0] / scale,
|
r_x1=rect.r_x1 / scale,
|
||||||
r_y1=r_1[1] / scale,
|
r_y1=rect.r_y1 / scale,
|
||||||
r_x2=r_2[0] / scale,
|
r_x2=rect.r_x2 / scale,
|
||||||
r_y2=r_2[1] / scale,
|
r_y2=rect.r_y2 / scale,
|
||||||
r_x3=r_3[0] / scale,
|
r_x3=rect.r_x3 / scale,
|
||||||
r_y3=r_3[1] / scale,
|
r_y3=rect.r_y3 / scale,
|
||||||
coord_origin=CoordOrigin.TOPLEFT,
|
coord_origin=CoordOrigin.TOPLEFT,
|
||||||
)
|
)
|
||||||
if offset is not None:
|
if original_offset is not None:
|
||||||
rect.r_x0 += offset.l
|
if not original_offset.coord_origin is CoordOrigin.TOPLEFT:
|
||||||
rect.r_x1 += offset.l
|
msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
|
||||||
rect.r_x2 += offset.l
|
raise ValueError(msg)
|
||||||
rect.r_x3 += offset.l
|
if original_offset is not None:
|
||||||
rect.r_y0 += offset.t
|
rect.r_x0 += original_offset.l
|
||||||
rect.r_y1 += offset.t
|
rect.r_x1 += original_offset.l
|
||||||
rect.r_y2 += offset.t
|
rect.r_x2 += original_offset.l
|
||||||
rect.r_y3 += offset.t
|
rect.r_x3 += original_offset.l
|
||||||
|
rect.r_y0 += original_offset.t
|
||||||
|
rect.r_y1 += original_offset.t
|
||||||
|
rect.r_y2 += original_offset.t
|
||||||
|
rect.r_y3 += original_offset.t
|
||||||
return rect
|
return rect
|
||||||
|
@ -2,12 +2,8 @@ from collections import Counter
|
|||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
from docling_core.types.doc.page import TextCell
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
Point = Tuple[float, float]
|
|
||||||
Box = Tuple[float, float, float, float]
|
|
||||||
Size = Tuple[int, int]
|
|
||||||
|
|
||||||
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
||||||
|
|
||||||
@ -23,32 +19,66 @@ def detect_orientation(cells: list[TextCell]) -> int:
|
|||||||
return max(orientation_counter.items(), key=itemgetter(1))[0]
|
return max(orientation_counter.items(), key=itemgetter(1))[0]
|
||||||
|
|
||||||
|
|
||||||
def rotate_ltwh_bounding_box(
|
def rotate_bounding_box(
|
||||||
box: Box, orientation: int, rotated_im_size: Size
|
bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
|
||||||
) -> tuple[Point, Point, Point, Point]:
|
) -> BoundingRectangle:
|
||||||
# The box is left top width height in TOPLEFT coordinates
|
# The box is left top width height in TOPLEFT coordinates
|
||||||
# Bounding rectangle start with r_0 at the bottom left whatever the
|
# Bounding rectangle start with r_0 at the bottom left whatever the
|
||||||
# coordinate system. Then other corners are found rotating counterclockwise
|
# coordinate system. Then other corners are found rotating counterclockwise
|
||||||
l, t, w, h = box
|
bbox = bbox.to_top_left_origin(im_size[1])
|
||||||
rotated_im_w, rotated_im_h = rotated_im_size
|
l, t, w, h = bbox.l, bbox.t, bbox.width, bbox.height
|
||||||
if orientation == 0:
|
im_h, im_w = im_size
|
||||||
r0_x = l
|
angle = angle % 360
|
||||||
r0_y = t + h
|
if angle == 0:
|
||||||
return (r0_x, r0_y), (r0_x + w, r0_y), (r0_x + w, r0_y - h), (r0_x, r0_y - h)
|
r_x0 = l
|
||||||
if orientation == 90:
|
r_y0 = t + h
|
||||||
r0_x = t + h
|
r_x1 = r_x0 + w
|
||||||
r0_y = rotated_im_w - l
|
r_y1 = r_y0
|
||||||
return (r0_x, r0_y), (r0_x, r0_y - w), (r0_x - h, r0_y - w), (r0_x - h, r0_y)
|
r_x2 = r_x0 + w
|
||||||
if orientation == 180:
|
r_y2 = r_y0 - h
|
||||||
r0_x = rotated_im_w - l
|
r_x3 = r_x0
|
||||||
r0_y = rotated_im_h - (t + h)
|
r_y3 = r_y0 - h
|
||||||
return (r0_x, r0_y), (r0_x - w, r0_y), (r0_x - w, r0_y + h), (r0_x, r0_y + h)
|
elif angle == 90:
|
||||||
if orientation == 270:
|
r_x0 = im_w - (t + h)
|
||||||
r0_x = rotated_im_h - (t + h)
|
r_y0 = l
|
||||||
r0_y = l
|
r_x1 = r_x0
|
||||||
return (r0_x, r0_y), (r0_x, r0_y + w), (r0_x + h, r0_y + w), (r0_x, r0_y + w)
|
r_y1 = r_y0 + w
|
||||||
msg = (
|
r_x2 = r_x0 + h
|
||||||
f"orientation {orientation}, expected values in:"
|
r_y2 = r_y0 + w
|
||||||
f" {sorted(CLIPPED_ORIENTATIONS)}"
|
r_x3 = r_x0
|
||||||
|
r_y3 = r_y0 + w
|
||||||
|
elif angle == 180:
|
||||||
|
r_x0 = im_h - l
|
||||||
|
r_y0 = im_w - (t + h)
|
||||||
|
r_x1 = r_x0 - w
|
||||||
|
r_y1 = r_y0
|
||||||
|
r_x2 = r_x0 - w
|
||||||
|
r_y2 = r_y0 + h
|
||||||
|
r_x3 = r_x0
|
||||||
|
r_y3 = r_y0 + h
|
||||||
|
elif angle == 270:
|
||||||
|
r_x0 = t + h
|
||||||
|
r_y0 = im_h - l
|
||||||
|
r_x1 = r_x0
|
||||||
|
r_y1 = r_y0 - w
|
||||||
|
r_x2 = r_x0 - h
|
||||||
|
r_y2 = r_y0 - w
|
||||||
|
r_x3 = r_x0 - h
|
||||||
|
r_y3 = r_y0
|
||||||
|
else:
|
||||||
|
msg = (
|
||||||
|
f"invalid orientation {angle}, expected values in:"
|
||||||
|
f" {sorted(CLIPPED_ORIENTATIONS)}"
|
||||||
|
)
|
||||||
|
raise ValueError(msg)
|
||||||
|
return BoundingRectangle(
|
||||||
|
r_x0=r_x0,
|
||||||
|
r_y0=r_y0,
|
||||||
|
r_x1=r_x1,
|
||||||
|
r_y1=r_y1,
|
||||||
|
r_x2=r_x2,
|
||||||
|
r_y2=r_y2,
|
||||||
|
r_x3=r_x3,
|
||||||
|
r_y3=r_y3,
|
||||||
|
coord_origin=CoordOrigin.TOPLEFT,
|
||||||
)
|
)
|
||||||
raise ValueError(msg)
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
<document>
|
<document>
|
||||||
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
|
<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
|
||||||
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
|
<paragraph><location><page_1><loc_15><loc_12><loc_88><loc_15></location>JSON and Markdown in an easy self contained</paragraph>
|
||||||
|
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_11></location>Docling bundles PDF document conversion to</paragraph>
|
||||||
</document>
|
</document>
|
@ -1 +1 @@
|
|||||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 523.0, 123.25502522786462], "page": 1, "span": [0, 86], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated_180.pdf", "filename-prov": null, "document-hash": "a9cbfe0f2a71171face9ee31d2347ca4195649670ad75680520d67d4a863f982", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "baca27070f05dd84cf0903ded39bcf0fc1fa6ef0ac390e79cf8ba90c8c33ba49", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [444.6666666666667, 131.58835856119788, 521.6666666666666, 150.25502522786462], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 104.58835856119788, 523.0, 123.25502522786462], "page": 1, "span": [0, 43], "__ref_s3_data": null}], "text": "JSON and Markdown in an easy self contained", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.0, 77.92169189453125, 521.3333333333334, 96.58835856119788], "page": 1, "span": [0, 42], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -1,3 +1,5 @@
|
|||||||
package
|
package
|
||||||
|
|
||||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
|
JSON and Markdown in an easy self contained
|
||||||
|
|
||||||
|
Docling bundles PDF document conversion to
|
File diff suppressed because one or more lines are too long
@ -1,3 +1,4 @@
|
|||||||
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
|
<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
|
||||||
<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
|
<text><loc_77><loc_427><loc_439><loc_438>JSON and Markdown in an easy self contained</text>
|
||||||
|
<text><loc_77><loc_443><loc_438><loc_454>Docling bundles PDF document conversion to</text>
|
||||||
</doctag>
|
</doctag>
|
@ -1 +1 @@
|
|||||||
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 86]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test_rotated_180", "origin": {"mimetype": "application/pdf", "binary_hash": 2530576989861832966, "filename": "ocr_test_rotated_180.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}, {"cref": "#/texts/1"}, {"cref": "#/texts/2"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 444.6666666666667, "t": 150.25502522786462, "r": 521.6666666666666, "b": 131.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 7]}], "orig": "package", "text": "package", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/1", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 123.25502522786462, "r": 523.0, "b": 104.58835856119788, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 43]}], "orig": "JSON and Markdown in an easy self contained", "text": "JSON and Markdown in an easy self contained", "formatting": null, "hyperlink": null}, {"self_ref": "#/texts/2", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 92.0, "t": 96.58835856119788, "r": 521.3333333333334, "b": 77.92169189453125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 42]}], "orig": "Docling bundles PDF document conversion to", "text": "Docling bundles PDF document conversion to", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
@ -1,3 +1,5 @@
|
|||||||
package
|
package
|
||||||
|
|
||||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained
|
JSON and Markdown in an easy self contained
|
||||||
|
|
||||||
|
Docling bundles PDF document conversion to
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user