Big refactoring for legacy_document support

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-14 16:36:11 +02:00
parent 08ab628e75
commit 497ddb34a8
54 changed files with 1198 additions and 590 deletions

View File

@@ -31,37 +31,12 @@ def export_documents(
success_count += 1
doc_filename = conv_res.input.file.stem
if USE_LEGACY:
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.legacy.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.legacy.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.legacy.md").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
if USE_V2:
# Export Docling document format to JSON (experimental):
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(
json.dumps(
conv_res.output.model_dump(
conv_res.document.model_dump(
mode="json", by_alias=True, exclude_none=True
)
)
@@ -71,7 +46,7 @@ def export_documents(
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
fp.write(
yaml.safe_dump(
conv_res.output.model_dump(
conv_res.document.model_dump(
mode="json", by_alias=True, exclude_none=True
)
)
@@ -79,15 +54,42 @@ def export_documents(
# Export Docling document format to doctags (experimental):
with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
fp.write(conv_res.output.export_to_document_tokens())
fp.write(conv_res.document.export_to_document_tokens())
# Export Docling document format to markdown (experimental):
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.output.export_to_markdown())
fp.write(conv_res.document.export_to_markdown())
# Export Docling document format to text (experimental):
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.output.export_to_markdown(strict_text=True))
fp.write(conv_res.document.export_to_markdown(strict_text=True))
if USE_LEGACY:
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.legacy.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.legacy_document.export_to_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.legacy.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(
conv_res.legacy_document.export_to_markdown(strict_text=True)
)
# Export Markdown format:
with (output_dir / f"{doc_filename}.legacy.md").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.legacy_document.export_to_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.legacy_document.export_to_doctags())
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(

View File

@@ -119,19 +119,19 @@ def main():
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
fp.write(json.dumps(conv_result.output.export_to_dict()))
fp.write(json.dumps(conv_result.document.export_to_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_result.output.export_to_text())
fp.write(conv_result.document.export_to_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_result.output.export_to_markdown())
fp.write(conv_result.document.export_to_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
fp.write(conv_result.output.export_to_document_tokens())
fp.write(conv_result.document.export_to_document_tokens())
if __name__ == "__main__":

View File

@@ -26,7 +26,7 @@ def main():
doc_filename = conv_res.input.file.stem
# Export tables
for table_ix, table in enumerate(conv_res.legacy_output.tables):
for table_ix, table in enumerate(conv_res.document.tables):
table_df: pd.DataFrame = table.export_to_dataframe()
print(f"## Table {table_ix}")
print(table_df.to_markdown())

View File

@@ -3,6 +3,8 @@ from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
result = converter.convert(source)
print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]"
print(
result.document.export_to_markdown()
) # output: ## Docling Technical Report [...]
# if the legacy output is needed, use this version
# print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]"
# print(result.legacy_document.export_to_markdown()) # output: ## Docling Technical Report [...]

View File

@@ -61,7 +61,7 @@ for res in conv_results:
# print(res.experimental.export_to_markdown())
# Export Docling document format to markdown (experimental):
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
fp.write(res.output.export_to_markdown())
fp.write(res.document.export_to_markdown())
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
fp.write(json.dumps(res.output.export_to_dict()))
fp.write(json.dumps(res.document.export_to_dict()))