mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
fix: add pages to DoclingDoc
Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
parent
b52b5672c9
commit
50d2ef1ad6
@ -1,16 +1,49 @@
|
|||||||
|
import base64
|
||||||
|
import io
|
||||||
|
|
||||||
|
from PIL import Image as PILImage
|
||||||
from docling_core.experimental.serializer.doctags import (
|
from docling_core.experimental.serializer.doctags import (
|
||||||
DocTagsDocSerializer,
|
DocTagsDocSerializer,
|
||||||
DocTagsParams,
|
DocTagsParams,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc import DoclingDocument
|
from docling_core.types.doc import DoclingDocument, Size
|
||||||
from docling_core.types.doc.document import DocTagsDocument
|
from docling_core.types.doc.document import DocTagsDocument, ImageRef, PageItem
|
||||||
from PIL import Image as PILImage
|
from pydantic import AnyUrl
|
||||||
|
|
||||||
|
|
||||||
def remove_doctags_content(doctags: str, image: PILImage.Image) -> str:
|
def remove_doctags_content(doctags: str, image: PILImage.Image) -> str:
|
||||||
|
def from_pil_to_base64(img: PILImage.Image) -> str:
    """Encode a PIL image as a Base64 string of its PNG bytes.

    Args:
        img: Image to serialize.

    Returns:
        The PNG-serialized image, Base64-encoded and decoded to text as UTF-8.
    """
    # Serialize into an in-memory buffer; the output format is always PNG.
    png_buffer = io.BytesIO()
    img.save(png_buffer, format="PNG")

    # Base64-encode the raw PNG bytes and hand them back as text.
    return base64.b64encode(png_buffer.getvalue()).decode("utf-8")
|
||||||
|
|
||||||
|
def from_pil_to_base64uri(img: PILImage.Image) -> AnyUrl:
    """Build a PNG ``data:`` URI carrying the given PIL image.

    Args:
        img: Image to embed.

    Returns:
        An ``AnyUrl`` of the form ``data:image/png;base64,<payload>``, where
        the payload is produced by ``from_pil_to_base64``.
    """
    encoded = from_pil_to_base64(img)
    return AnyUrl(f"data:image/png;base64,{encoded}")
|
||||||
|
|
||||||
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
||||||
doc = DoclingDocument(name="dummy")
|
doc = DoclingDocument(name="dummy")
|
||||||
doc.load_from_doctags(doctags_doc)
|
doc.load_from_doctags(doctags_doc)
|
||||||
|
image_ref = ImageRef(
|
||||||
|
mimetype="image/png",
|
||||||
|
dpi=72,
|
||||||
|
size=Size(width=float(image.width), height=float(image.height)),
|
||||||
|
uri=from_pil_to_base64uri(image),
|
||||||
|
)
|
||||||
|
page_item = PageItem(
|
||||||
|
page_no=1,
|
||||||
|
size=Size(width=float(image.width), height=float(image.height)),
|
||||||
|
image=image_ref,
|
||||||
|
)
|
||||||
|
|
||||||
|
doc.pages[1] = page_item
|
||||||
dt_params = DocTagsParams(add_content=False)
|
dt_params = DocTagsParams(add_content=False)
|
||||||
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
|
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
|
||||||
pages = [ser.serialize(item=item) for item, _ in doc.iterate_items()]
|
pages = [ser.serialize(item=item) for item, _ in doc.iterate_items()]
|
||||||
|
BIN
tests/data_scanned/ocr_test.png
Normal file
BIN
tests/data_scanned/ocr_test.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 57 KiB |
11
tests/test_doctags_utils.py
Normal file
11
tests/test_doctags_utils.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
|
from docling.utils.doctags_utils import remove_doctags_content
|
||||||
|
|
||||||
|
def test_remove_doctags_content():
    """Check ``remove_doctags_content`` against the scanned OCR sample.

    Reads the sample page image and its ground-truth DocTags, runs the
    function, and compares the result with the expected DocTags string,
    which carries only location tags and no text content.

    Both paths are relative to the current working directory.
    """
    with open(
        "./tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt",
        encoding="utf-8",  # explicit so the read does not depend on the platform default
    ) as f:
        doctags = f.read()

    # PIL opens images lazily and keeps the file handle; the context manager
    # guarantees it is released once the function under test has run.
    with PILImage.open("./tests/data_scanned/ocr_test.png") as img:
        actual = remove_doctags_content(doctags, img)

    expected = "<doctag><text><loc_58><loc_44><loc_426><loc_91></text>\n</doctag>"
    assert actual == expected
|
Loading…
Reference in New Issue
Block a user