diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 5791c0e4..4c71f5c8 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -412,7 +412,11 @@ class _DocumentConversionInput(BaseModel): else: return "application/xml" - if re.match(r".*?\s*)?( - Concepts
Learn Docling fundamendals
+ Concepts
Learn Docling fundamentals
Examples
Try out recipes for various use cases, including conversion, RAG, and more
Integrations
Check out integrations with popular frameworks and tools
Reference
See more API details
diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index 94a68873..d5e40f0f 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -132,6 +132,13 @@ def test_guess_format(tmp_path): doc_path = Path("./tests/data/html/wiki_duck.html") assert dci._guess_format(doc_path) == InputFormat.HTML + html_str = ( # HTML starting with a script + "" + '\n' + ) + stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode())) + assert dci._guess_format(stream) == InputFormat.HTML + # Valid MD buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read()) stream = DocumentStream(name="wiki.md", stream=buf) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 0db53502..96cf9c3c 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -323,33 +323,33 @@ def verify_conversion_result_v1( if generate: # only used when re-generating truth pages_path.parent.mkdir(parents=True, exist_ok=True) - with open(pages_path, "w") as fw: + with open(pages_path, mode="w", encoding="utf-8") as fw: fw.write( json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent) ) json_path.parent.mkdir(parents=True, exist_ok=True) - with open(json_path, "w") as fw: + with open(json_path, mode="w", encoding="utf-8") as fw: fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent)) md_path.parent.mkdir(parents=True, exist_ok=True) - with open(md_path, "w") as fw: + with open(md_path, mode="w", encoding="utf-8") as fw: fw.write(doc_pred_md) dt_path.parent.mkdir(parents=True, exist_ok=True) - with open(dt_path, "w") as fw: + with open(dt_path, mode="w", encoding="utf-8") as fw: fw.write(doc_pred_dt) else: # default branch in test - with open(pages_path) as fr: + with open(pages_path, encoding="utf-8") as fr: doc_true_pages = PageList.validate_json(fr.read()) - with open(json_path) as fr: + with open(json_path, encoding="utf-8") as fr: doc_true: DsDocument = DsDocument.model_validate_json(fr.read()) - with open(md_path) as fr: + with open(md_path, encoding="utf-8") as fr: doc_true_md = fr.read() - with open(dt_path) as fr: + with open(dt_path, encoding="utf-8") as fr: doc_true_dt = fr.read() if not fuzzy: @@ -408,33 +408,33 @@ def verify_conversion_result_v2( if generate: # only used when re-generating truth pages_path.parent.mkdir(parents=True, exist_ok=True) - with open(pages_path, "w") as fw: + with open(pages_path, mode="w", encoding="utf-8") as fw: fw.write( json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent) ) json_path.parent.mkdir(parents=True, exist_ok=True) - with open(json_path, "w") as fw: + with open(json_path, mode="w", encoding="utf-8") as fw: fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent)) md_path.parent.mkdir(parents=True, exist_ok=True) - with open(md_path, "w") as fw: + with open(md_path, mode="w", encoding="utf-8") as fw: fw.write(doc_pred_md) dt_path.parent.mkdir(parents=True, exist_ok=True) - with open(dt_path, "w") as fw: + with open(dt_path, mode="w", encoding="utf-8") as fw: fw.write(doc_pred_dt) else: # default branch in test - with open(pages_path) as fr: + with open(pages_path, encoding="utf-8") as fr: doc_true_pages = PageList.validate_json(fr.read()) - with open(json_path) as fr: + with open(json_path, encoding="utf-8") as fr: doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read()) - with open(md_path) as fr: + with open(md_path, encoding="utf-8") as fr: doc_true_md = fr.read() - with open(dt_path) as fr: + with open(dt_path, encoding="utf-8") as fr: doc_true_dt = fr.read() if not fuzzy: @@ -461,12 +461,12 @@ def verify_conversion_result_v2( def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False): if not os.path.exists(gtfile) or generate: - with open(gtfile, "w") as fw: + with open(gtfile, mode="w", encoding="utf-8") as fw: json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2) return True else: - with open(gtfile) as fr: + with open(gtfile, encoding="utf-8") as fr: true_doc = DoclingDocument.model_validate_json(fr.read()) return verify_docitems(pred_doc, true_doc, fuzzy=False) @@ -476,11 +476,11 @@ def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool: file = Path(gtfile) if not file.exists() or generate: - with file.open("w") as fw: + with file.open(mode="w", encoding="utf-8") as fw: fw.write(pred_text) return True - with file.open("r") as fr: + with file.open(encoding="utf-8") as fr: true_text = fr.read() return pred_text == true_text