mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat(docx): add text formatting and hyperlink support (#630)
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m27s
Run Docs CI / build-docs (push) Failing after 52s
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m27s
Run Docs CI / build-docs (push) Failing after 52s
* feat: Enable markdown text formatting for docx Signed-off-by: SimJeg <sjegou@nvidia.com> * Fix imports Signed-off-by: SimJeg <sjegou@nvidia.com> * Use Formatting Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle hyperlink Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle formatting properly for DocItemLabel.PARAGRAPH Signed-off-by: SimJeg <sjegou@nvidia.com> * Use inline group Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle bullet lists Signed-off-by: SimJeg <sjegou@nvidia.com> * Strip elements Signed-off-by: SimJeg <sjegou@nvidia.com> * Strip elements Signed-off-by: SimJeg <sjegou@nvidia.com> * Run black and mypy Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle header and footer Signed-off-by: SimJeg <sjegou@nvidia.com> * Use inline_fmt everywhere Signed-off-by: SimJeg <sjegou@nvidia.com> * Run precommit Signed-off-by: SimJeg <sjegou@nvidia.com> * Address feedback Signed-off-by: SimJeg <sjegou@nvidia.com> * Fix add_list_item Signed-off-by: SimJeg <sjegou@nvidia.com> * fix minor bugs, mark helper methods internal Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> --------- Signed-off-by: SimJeg <sjegou@nvidia.com> Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> Co-authored-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
@@ -76,17 +76,19 @@ def test_e2e_docx_conversions():
|
||||
doc: DoclingDocument = conv_result.document
|
||||
|
||||
pred_md: str = doc.export_to_markdown()
|
||||
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
|
||||
assert verify_export(
|
||||
pred_md, str(gt_path) + ".md", generate=GENERATE
|
||||
), "export to md"
|
||||
|
||||
pred_itxt: str = doc._export_to_indented_text(
|
||||
max_text_len=70, explicit_tables=False
|
||||
)
|
||||
assert verify_export(
|
||||
pred_itxt, str(gt_path) + ".itxt"
|
||||
pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
|
||||
), "export to indented-text"
|
||||
|
||||
assert verify_document(
|
||||
doc, str(gt_path) + ".json", GENERATE
|
||||
doc, str(gt_path) + ".json", generate=GENERATE
|
||||
), "document document"
|
||||
|
||||
if docx_path.name == "word_tables.docx":
|
||||
|
||||
Reference in New Issue
Block a user