docling/examples/run_with_formats.py
Christoph Auer 203cf19b1b Lots of improvements
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2024-10-08 16:38:42 +02:00

56 lines
2.0 KiB
Python

import logging
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
# Module-level logger for this example script.
_log = logging.getLogger(__name__)

# Flag defined by the example; it is not read anywhere in the visible script.
USE_EXPERIMENTAL = False

# Sample documents to convert. The paths are relative, so the script is
# expected to be started from a directory where ``tests/data`` resolves.
# Each file is listed exactly once: output files are named after the input
# file name, so a repeated entry would only redo the conversion and overwrite
# the same output.
input_paths = [
    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/lorem_ipsum.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2206.01062.pdf"),
]

# NOTE: ``input`` shadows the builtin of the same name; the name is kept
# because the conversion call later in the script reads it by this name.
input = DocumentConversionInput.from_paths(input_paths)
# Build the converter used for every input document.
# for defaults use:
doc_converter = DocumentConverter()

# to customize use:
# NOTE(review): the DOCX entry below pairs MsWordDocumentBackend with
# StandardPdfModelPipeline, while SimpleModelPipeline is imported but unused
# in this script -- confirm which pipeline is intended before enabling it.
# doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
#     formats=[
#         InputFormat.PDF,
#         InputFormat.DOCX,
#     ],  # whitelist formats, other files are ignored.
#     format_options={
#         InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
#         InputFormat.DOCX: FormatOption(
#             pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend
#         ),
#         # InputFormat.IMAGE: PdfFormatOption(),
#     },
# )

# Convert the prepared inputs; the results are iterated by the export loop
# that follows.
conv_results = doc_converter.convert(input)
# Write one markdown file per conversion result into ./scratch.
output_dir = Path("./scratch")
# Create the target directory up front: opening a file for writing inside a
# directory that does not exist raises FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)

for res in conv_results:
    # The output name is derived from the input file name.
    out_path = output_dir / f"{res.input.file.name}.experimental.md"

    # print(res.experimental.export_to_markdown())
    # Export Docling document format to markdown (experimental):
    # The encoding is explicit so non-ASCII document text is written the same
    # way on every platform instead of depending on the locale default.
    with out_path.open("w", encoding="utf-8") as fp:
        fp.write(res.experimental.export_to_markdown())

    # Report only after the write succeeded, so the "Saved" line is accurate.
    print(
        f"Document {res.input.file.name} converted with status {res.status}."
        f"\nSaved markdown output to: {out_path}"
    )