mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
feat: add function to remove content from DocTags
Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
parent
7df157204b
commit
b52b5672c9
17
docling/utils/doctags_utils.py
Normal file
17
docling/utils/doctags_utils.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from docling_core.experimental.serializer.doctags import (
|
||||||
|
DocTagsDocSerializer,
|
||||||
|
DocTagsParams,
|
||||||
|
)
|
||||||
|
from docling_core.types.doc import DoclingDocument
|
||||||
|
from docling_core.types.doc.document import DocTagsDocument
|
||||||
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
|
|
||||||
|
def remove_doctags_content(doctags: str, image: PILImage.Image) -> str:
    """Re-serialize a DocTags string with the serializer's content output disabled.

    The DocTags markup and its page image are parsed into a throwaway
    ``DoclingDocument``; every item of that document is then serialized
    again with ``DocTagsParams(add_content=False)`` and the pieces are
    combined into a single DocTags string.

    Args:
        doctags: DocTags markup for one page.
        image: Page image paired with ``doctags`` when building the
            intermediate ``DocTagsDocument``.

    Returns:
        The text of the re-serialized document.
    """
    # A single (doctags, image) pair is wrapped in one-element lists, as the
    # constructor takes parallel sequences.
    parsed_pairs = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])

    # The document name is only a placeholder; the document is discarded
    # once it has been serialized.
    scratch_doc = DoclingDocument(name="dummy")
    scratch_doc.load_from_doctags(parsed_pairs)

    # NOTE(review): add_content=False presumably keeps the structural tags
    # while dropping item text — confirm against docling-core.
    params_without_content = DocTagsParams(add_content=False)
    serializer = DocTagsDocSerializer(params=params_without_content, doc=scratch_doc)

    # iterate_items() yields pairs; only the item itself is needed here.
    serialized_items = []
    for node, _ in scratch_doc.iterate_items():
        serialized_items.append(serializer.serialize(item=node))

    combined = serializer.serialize_doc(pages=serialized_items)
    return combined.text
|
734
poetry.lock
generated
734
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -46,7 +46,8 @@ packages = [{ include = "docling" }]
|
|||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-core = {extras = ["chunking"], version = "^2.23.1"}
|
docling-core = { git = "git@github.com:docling-project/docling-core.git", extras = ["chunking"], branch = "add-doctags-serializer"}
|
||||||
|
#docling-core = { extras = ["chunking"], version = "^2.23.1" }
|
||||||
docling-ibm-models = "^3.4.0"
|
docling-ibm-models = "^3.4.0"
|
||||||
docling-parse = "^4.0.0"
|
docling-parse = "^4.0.0"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
|
Loading…
Reference in New Issue
Block a user