feat: add function to remove content from DocTags

Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
Yusik Kim 2025-03-19 18:22:15 +01:00
parent 7df157204b
commit b52b5672c9
3 changed files with 687 additions and 67 deletions

View File

@ -0,0 +1,17 @@
from docling_core.experimental.serializer.doctags import (
DocTagsDocSerializer,
DocTagsParams,
)
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from PIL import Image as PILImage
def remove_doctags_content(doctags: str, image: PILImage.Image) -> str:
    """Re-serialize a DocTags string with ``add_content=False``.

    The DocTags string and its paired image are loaded into a temporary
    ``DoclingDocument``. Every item yielded by ``iterate_items()`` is then
    serialized by a ``DocTagsDocSerializer`` built with
    ``DocTagsParams(add_content=False)``, and the results are combined via
    ``serialize_doc``.

    Args:
        doctags: DocTags markup to process.
        image: Image paired with ``doctags`` when building the DocTags
            document.

    Returns:
        The ``text`` of the serialized document.
    """
    source = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])

    # Throwaway document used only as a container for the loaded DocTags.
    document = DoclingDocument(name="dummy")
    document.load_from_doctags(source)

    params = DocTagsParams(add_content=False)
    serializer = DocTagsDocSerializer(params=params, doc=document)

    serialized_items = []
    for node, _level in document.iterate_items():
        serialized_items.append(serializer.serialize(item=node))

    return serializer.serialize_doc(pages=serialized_items).text

734
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -46,7 +46,8 @@ packages = [{ include = "docling" }]
###################### ######################
python = "^3.9" python = "^3.9"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = {extras = ["chunking"], version = "^2.23.1"} docling-core = { git = "git@github.com:docling-project/docling-core.git", extras = ["chunking"], branch = "add-doctags-serializer"}
#docling-core = { extras = ["chunking"], version = "^2.23.1" }
docling-ibm-models = "^3.4.0" docling-ibm-models = "^3.4.0"
docling-parse = "^4.0.0" docling-parse = "^4.0.0"
filetype = "^1.2.0" filetype = "^1.2.0"