docling/examples/run_with_formats.py
Christoph Auer c0447206af Merge from main
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2024-10-08 14:42:33 +02:00

43 lines
1.4 KiB
Python

import logging
from pathlib import Path
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
# Example script: convert a fixed set of sample documents in several input
# formats and export each result to Markdown, both on stdout and as a file
# under ./scratch.

_log = logging.getLogger(__name__)

# NOTE(review): this flag is not referenced anywhere in the visible script;
# it is kept so anything reading it from outside keeps working.
USE_EXPERIMENTAL = False

# Sample inputs: one HTML page, two Word documents, one PowerPoint deck and
# one PDF, all given relative to the current working directory.
input_paths = [
    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/lorem_ipsum.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2206.01062.pdf"),
]

# Deliberately not called ``input`` so the builtin of that name stays usable.
conversion_input = DocumentConversionInput.from_paths(input_paths)

# for defaults use:
doc_converter = DocumentConverter()

# to customize use:
# doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
#     formats=[InputFormat.PDF, InputFormat.DOCX],
#     format_options={
#         InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
#         InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
#     }
# )

conv_results = doc_converter.convert(conversion_input)

# Create the output directory up front; without this the first write fails
# with FileNotFoundError on a checkout that has no ./scratch folder yet.
output_dir = Path("./scratch")
output_dir.mkdir(parents=True, exist_ok=True)

for res in conv_results:
    print("")
    print(
        f"Document {res.input.file.name} converted with status {res.status}. Content:"
    )

    # Render once and reuse the text for both the console and the file.
    markdown = res.experimental.export_to_markdown()
    print(markdown)

    # Export Docling document format to markdown (experimental):
    # An explicit UTF-8 encoding keeps the output independent of the
    # platform's default locale encoding.
    output_path = output_dir / f"{res.input.file.name}.experimental.md"
    with output_path.open("w", encoding="utf-8") as fp:
        fp.write(markdown)