mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
Big refactoring for legacy_document support
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -31,37 +31,12 @@ def export_documents(
|
||||
success_count += 1
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
if USE_LEGACY:
|
||||
# Export Deep Search document JSON format:
|
||||
with (output_dir / f"{doc_filename}.legacy.json").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(json.dumps(conv_res.render_as_dict()))
|
||||
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.legacy.txt").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_text())
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.legacy.md").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_markdown())
|
||||
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_doctags())
|
||||
|
||||
if USE_V2:
|
||||
# Export Docling document format to JSON (experimental):
|
||||
with (output_dir / f"{doc_filename}.json").open("w") as fp:
|
||||
fp.write(
|
||||
json.dumps(
|
||||
conv_res.output.model_dump(
|
||||
conv_res.document.model_dump(
|
||||
mode="json", by_alias=True, exclude_none=True
|
||||
)
|
||||
)
|
||||
@@ -71,7 +46,7 @@ def export_documents(
|
||||
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
|
||||
fp.write(
|
||||
yaml.safe_dump(
|
||||
conv_res.output.model_dump(
|
||||
conv_res.document.model_dump(
|
||||
mode="json", by_alias=True, exclude_none=True
|
||||
)
|
||||
)
|
||||
@@ -79,15 +54,42 @@ def export_documents(
|
||||
|
||||
# Export Docling document format to doctags (experimental):
|
||||
with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
|
||||
fp.write(conv_res.output.export_to_document_tokens())
|
||||
fp.write(conv_res.document.export_to_document_tokens())
|
||||
|
||||
# Export Docling document format to markdown (experimental):
|
||||
with (output_dir / f"{doc_filename}.md").open("w") as fp:
|
||||
fp.write(conv_res.output.export_to_markdown())
|
||||
fp.write(conv_res.document.export_to_markdown())
|
||||
|
||||
# Export Docling document format to text (experimental):
|
||||
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
|
||||
fp.write(conv_res.output.export_to_markdown(strict_text=True))
|
||||
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
||||
|
||||
if USE_LEGACY:
|
||||
# Export Deep Search document JSON format:
|
||||
with (output_dir / f"{doc_filename}.legacy.json").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(json.dumps(conv_res.legacy_document.export_to_dict()))
|
||||
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.legacy.txt").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(
|
||||
conv_res.legacy_document.export_to_markdown(strict_text=True)
|
||||
)
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.legacy.md").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.legacy_document.export_to_markdown())
|
||||
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.legacy_document.export_to_doctags())
|
||||
|
||||
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
|
||||
_log.info(
|
||||
|
||||
@@ -119,19 +119,19 @@ def main():
|
||||
|
||||
# Export Deep Search document JSON format:
|
||||
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
|
||||
fp.write(json.dumps(conv_result.output.export_to_dict()))
|
||||
fp.write(json.dumps(conv_result.document.export_to_dict()))
|
||||
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_result.output.export_to_text())
|
||||
fp.write(conv_result.document.export_to_text())
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_result.output.export_to_markdown())
|
||||
fp.write(conv_result.document.export_to_markdown())
|
||||
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_result.output.export_to_document_tokens())
|
||||
fp.write(conv_result.document.export_to_document_tokens())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -26,7 +26,7 @@ def main():
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
# Export tables
|
||||
for table_ix, table in enumerate(conv_res.legacy_output.tables):
|
||||
for table_ix, table in enumerate(conv_res.document.tables):
|
||||
table_df: pd.DataFrame = table.export_to_dataframe()
|
||||
print(f"## Table {table_ix}")
|
||||
print(table_df.to_markdown())
|
||||
|
||||
@@ -3,6 +3,8 @@ from docling.document_converter import DocumentConverter
|
||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||
converter = DocumentConverter()
|
||||
result = converter.convert(source)
|
||||
print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]"
|
||||
print(
|
||||
result.document.export_to_markdown()
|
||||
) # output: "## Docling Technical Report [...]"
|
||||
# if the legacy output is needed, use this version
|
||||
# print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]"
|
||||
# print(result.legacy_document.export_to_markdown()) # output: "## Docling Technical Report [...]"
|
||||
|
||||
@@ -61,7 +61,7 @@ for res in conv_results:
|
||||
# print(res.experimental.export_to_markdown())
|
||||
# Export Docling document format to markdown (experimental):
|
||||
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
|
||||
fp.write(res.output.export_to_markdown())
|
||||
fp.write(res.document.export_to_markdown())
|
||||
|
||||
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
|
||||
fp.write(json.dumps(res.output.export_to_dict()))
|
||||
fp.write(json.dumps(res.document.export_to_dict()))
|
||||
|
||||
Reference in New Issue
Block a user