mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
Refactor test data, legacy usage and more
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -12,7 +12,8 @@ from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
USE_EXPERIMENTAL = True
|
||||
USE_V2 = True
|
||||
USE_LEGACY = True
|
||||
|
||||
|
||||
def export_documents(
|
||||
@@ -30,52 +31,63 @@ def export_documents(
|
||||
success_count += 1
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
# Export Deep Search document JSON format:
|
||||
with (output_dir / f"{doc_filename}.json").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(json.dumps(conv_res.render_as_dict()))
|
||||
if USE_LEGACY:
|
||||
# Export Deep Search document JSON format:
|
||||
with (output_dir / f"{doc_filename}.legacy.json").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(json.dumps(conv_res.render_as_dict_v1()))
|
||||
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_res.render_as_text())
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.legacy.txt").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_text_v1())
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_res.render_as_markdown())
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.legacy.md").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_markdown_v1())
|
||||
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.doctags").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_doctags())
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_doctags_v1())
|
||||
|
||||
if USE_EXPERIMENTAL:
|
||||
if USE_V2:
|
||||
# Export Docling document format to JSON (experimental):
|
||||
with (output_dir / f"{doc_filename}.experimental.json").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.json").open("w") as fp:
|
||||
fp.write(
|
||||
json.dumps(
|
||||
conv_res.experimental.model_dump(mode="json", by_alias=True)
|
||||
conv_res.output.model_dump(
|
||||
mode="json", by_alias=True, exclude_none=True
|
||||
)
|
||||
)
|
||||
)
|
||||
) # TODO to be replaced with convenience method
|
||||
|
||||
# Export Docling document format to YAML (experimental):
|
||||
with (output_dir / f"{doc_filename}.experimental.yaml").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
|
||||
fp.write(
|
||||
yaml.safe_dump(
|
||||
conv_res.experimental.model_dump(mode="json", by_alias=True)
|
||||
conv_res.output.model_dump(
|
||||
mode="json", by_alias=True, exclude_none=True
|
||||
)
|
||||
)
|
||||
)
|
||||
) # TODO to be replaced with convenience method
|
||||
|
||||
# Export Docling document format to doctags (experimental):
|
||||
with (output_dir / f"{doc_filename}.experimental.doctags").open(
|
||||
"w"
|
||||
) as fp:
|
||||
fp.write(conv_res.experimental.export_to_document_tokens())
|
||||
with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
|
||||
fp.write(conv_res.output.export_to_document_tokens())
|
||||
|
||||
# Export Docling document format to markdown (experimental):
|
||||
with (output_dir / f"{doc_filename}.experimental.md").open("w") as fp:
|
||||
fp.write(conv_res.experimental.export_to_markdown())
|
||||
with (output_dir / f"{doc_filename}.md").open("w") as fp:
|
||||
fp.write(conv_res.output.export_to_markdown())
|
||||
|
||||
# Export Docling document format to text (experimental):
|
||||
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
|
||||
fp.write(conv_res.output.export_to_markdown(strict_text=True))
|
||||
|
||||
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
|
||||
_log.info(
|
||||
|
||||
@@ -39,17 +39,17 @@ def export_documents(
|
||||
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_res.render_as_text())
|
||||
fp.write(conv_res.render_as_text_v1())
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_res.render_as_markdown())
|
||||
fp.write(conv_res.render_as_markdown_v1())
|
||||
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.doctags").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_doctags())
|
||||
fp.write(conv_res.render_as_doctags_v1())
|
||||
|
||||
else:
|
||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
||||
|
||||
@@ -39,7 +39,7 @@ def main():
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
# Export tables
|
||||
for table_ix, table in enumerate(conv_res.output.tables):
|
||||
for table_ix, table in enumerate(conv_res.legacy_output.tables):
|
||||
table_df: pd.DataFrame = table.export_to_dataframe()
|
||||
print(f"## Table {table_ix}")
|
||||
print(table_df.to_markdown())
|
||||
|
||||
@@ -3,4 +3,4 @@ from docling.document_converter import DocumentConverter
|
||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||
converter = DocumentConverter()
|
||||
doc = converter.convert_single(source)
|
||||
print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]"
|
||||
print(doc.render_as_markdown_v1()) # output: ## Docling Technical Report [...]"
|
||||
|
||||
@@ -59,4 +59,4 @@ for res in conv_results:
|
||||
# print(res.experimental.export_to_markdown())
|
||||
# Export Docling document format to markdown (experimental):
|
||||
with out_path.open("w") as fp:
|
||||
fp.write(res.experimental.export_to_markdown())
|
||||
fp.write(res.output.export_to_markdown())
|
||||
|
||||
Reference in New Issue
Block a user