mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Merge remote-tracking branch 'origin/main' into dev/add-other-vlm-models
This commit is contained in:
commit
738385004a
@ -412,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
else:
|
else:
|
||||||
return "application/xml"
|
return "application/xml"
|
||||||
|
|
||||||
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
if re.match(
|
||||||
|
r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
|
||||||
|
content_str,
|
||||||
|
re.DOTALL,
|
||||||
|
):
|
||||||
return "text/html"
|
return "text/html"
|
||||||
|
|
||||||
p = re.compile(
|
p = re.compile(
|
||||||
|
2
docs/index.md
vendored
2
docs/index.md
vendored
@ -39,7 +39,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|||||||
## Get started
|
## Get started
|
||||||
|
|
||||||
<div class="grid">
|
<div class="grid">
|
||||||
<a href="concepts/" class="card"><b>Concepts</b><br />Learn Docling fundamendals</a>
|
<a href="concepts/" class="card"><b>Concepts</b><br />Learn Docling fundamentals</a>
|
||||||
<a href="examples/" class="card"><b>Examples</b><br />Try out recipes for various use cases, including conversion, RAG, and more</a>
|
<a href="examples/" class="card"><b>Examples</b><br />Try out recipes for various use cases, including conversion, RAG, and more</a>
|
||||||
<a href="integrations/" class="card"><b>Integrations</b><br />Check out integrations with popular frameworks and tools</a>
|
<a href="integrations/" class="card"><b>Integrations</b><br />Check out integrations with popular frameworks and tools</a>
|
||||||
<a href="reference/document_converter/" class="card"><b>Reference</b><br />See more API details</a>
|
<a href="reference/document_converter/" class="card"><b>Reference</b><br />See more API details</a>
|
||||||
|
@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
|
|||||||
doc_path = Path("./tests/data/html/wiki_duck.html")
|
doc_path = Path("./tests/data/html/wiki_duck.html")
|
||||||
assert dci._guess_format(doc_path) == InputFormat.HTML
|
assert dci._guess_format(doc_path) == InputFormat.HTML
|
||||||
|
|
||||||
|
html_str = ( # HTML starting with a script
|
||||||
|
"<script>\nconsole.log('foo');\n</script>"
|
||||||
|
'<!doctype html>\n<html lang="en-us class="no-js"></html>'
|
||||||
|
)
|
||||||
|
stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
|
||||||
|
assert dci._guess_format(stream) == InputFormat.HTML
|
||||||
|
|
||||||
# Valid MD
|
# Valid MD
|
||||||
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
|
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
|
||||||
stream = DocumentStream(name="wiki.md", stream=buf)
|
stream = DocumentStream(name="wiki.md", stream=buf)
|
||||||
|
@ -323,33 +323,33 @@ def verify_conversion_result_v1(
|
|||||||
|
|
||||||
if generate: # only used when re-generating truth
|
if generate: # only used when re-generating truth
|
||||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(pages_path, "w") as fw:
|
with open(pages_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(
|
fw.write(
|
||||||
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
||||||
)
|
)
|
||||||
|
|
||||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(json_path, "w") as fw:
|
with open(json_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
||||||
|
|
||||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(md_path, "w") as fw:
|
with open(md_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(doc_pred_md)
|
fw.write(doc_pred_md)
|
||||||
|
|
||||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(dt_path, "w") as fw:
|
with open(dt_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(doc_pred_dt)
|
fw.write(doc_pred_dt)
|
||||||
else: # default branch in test
|
else: # default branch in test
|
||||||
with open(pages_path) as fr:
|
with open(pages_path, encoding="utf-8") as fr:
|
||||||
doc_true_pages = PageList.validate_json(fr.read())
|
doc_true_pages = PageList.validate_json(fr.read())
|
||||||
|
|
||||||
with open(json_path) as fr:
|
with open(json_path, encoding="utf-8") as fr:
|
||||||
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
|
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
|
||||||
|
|
||||||
with open(md_path) as fr:
|
with open(md_path, encoding="utf-8") as fr:
|
||||||
doc_true_md = fr.read()
|
doc_true_md = fr.read()
|
||||||
|
|
||||||
with open(dt_path) as fr:
|
with open(dt_path, encoding="utf-8") as fr:
|
||||||
doc_true_dt = fr.read()
|
doc_true_dt = fr.read()
|
||||||
|
|
||||||
if not fuzzy:
|
if not fuzzy:
|
||||||
@ -408,33 +408,33 @@ def verify_conversion_result_v2(
|
|||||||
|
|
||||||
if generate: # only used when re-generating truth
|
if generate: # only used when re-generating truth
|
||||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(pages_path, "w") as fw:
|
with open(pages_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(
|
fw.write(
|
||||||
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
||||||
)
|
)
|
||||||
|
|
||||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(json_path, "w") as fw:
|
with open(json_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
||||||
|
|
||||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(md_path, "w") as fw:
|
with open(md_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(doc_pred_md)
|
fw.write(doc_pred_md)
|
||||||
|
|
||||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(dt_path, "w") as fw:
|
with open(dt_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(doc_pred_dt)
|
fw.write(doc_pred_dt)
|
||||||
else: # default branch in test
|
else: # default branch in test
|
||||||
with open(pages_path) as fr:
|
with open(pages_path, encoding="utf-8") as fr:
|
||||||
doc_true_pages = PageList.validate_json(fr.read())
|
doc_true_pages = PageList.validate_json(fr.read())
|
||||||
|
|
||||||
with open(json_path) as fr:
|
with open(json_path, encoding="utf-8") as fr:
|
||||||
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
|
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
|
||||||
|
|
||||||
with open(md_path) as fr:
|
with open(md_path, encoding="utf-8") as fr:
|
||||||
doc_true_md = fr.read()
|
doc_true_md = fr.read()
|
||||||
|
|
||||||
with open(dt_path) as fr:
|
with open(dt_path, encoding="utf-8") as fr:
|
||||||
doc_true_dt = fr.read()
|
doc_true_dt = fr.read()
|
||||||
|
|
||||||
if not fuzzy:
|
if not fuzzy:
|
||||||
@ -461,12 +461,12 @@ def verify_conversion_result_v2(
|
|||||||
|
|
||||||
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
|
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
|
||||||
if not os.path.exists(gtfile) or generate:
|
if not os.path.exists(gtfile) or generate:
|
||||||
with open(gtfile, "w") as fw:
|
with open(gtfile, mode="w", encoding="utf-8") as fw:
|
||||||
json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)
|
json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
with open(gtfile) as fr:
|
with open(gtfile, encoding="utf-8") as fr:
|
||||||
true_doc = DoclingDocument.model_validate_json(fr.read())
|
true_doc = DoclingDocument.model_validate_json(fr.read())
|
||||||
|
|
||||||
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
||||||
@ -476,11 +476,11 @@ def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool:
|
|||||||
file = Path(gtfile)
|
file = Path(gtfile)
|
||||||
|
|
||||||
if not file.exists() or generate:
|
if not file.exists() or generate:
|
||||||
with file.open("w") as fw:
|
with file.open(mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(pred_text)
|
fw.write(pred_text)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
with file.open("r") as fr:
|
with file.open(encoding="utf-8") as fr:
|
||||||
true_text = fr.read()
|
true_text = fr.read()
|
||||||
|
|
||||||
return pred_text == true_text
|
return pred_text == true_text
|
||||||
|
Loading…
Reference in New Issue
Block a user