"""Example script: convert documents of several formats and save them as markdown.

Builds a conversion input from a fixed list of sample files (HTML, DOCX, PPTX,
PDF), runs them through a default ``DocumentConverter`` and writes the
experimental markdown export of every result to ``./scratch``.
"""

import logging
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

_log = logging.getLogger(__name__)

USE_EXPERIMENTAL = False

# Sample documents to convert, one per supported input format family.
input_paths = [
    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/lorem_ipsum.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2206.01062.pdf"),
]
# NOTE: this name shadows the ``input`` builtin; it is kept so the module's
# visible names stay unchanged.
input = DocumentConversionInput.from_paths(input_paths)

# for defaults use:
doc_converter = DocumentConverter()

# to customize use:
# doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
#     formats=[
#         InputFormat.PDF,
#         InputFormat.DOCX,
#     ],  # whitelist formats, other files are ignored.
#     format_options={
#         InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
#         InputFormat.DOCX: FormatOption(
#             pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend
#         ),
#         # InputFormat.IMAGE: PdfFormatOption(),
#     },
# )

conv_results = doc_converter.convert(input)

# Make sure the output directory exists; opening a file for writing inside a
# missing directory raises FileNotFoundError.
out_dir = Path("./scratch")
out_dir.mkdir(parents=True, exist_ok=True)

for res in conv_results:
    out_path = out_dir / f"{res.input.file.name}.experimental.md"

    # print(res.experimental.export_to_markdown())
    # Export Docling document format to markdown (experimental).
    # An explicit UTF-8 encoding keeps the write independent of the platform's
    # default encoding, which may not be able to represent the document text.
    with out_path.open("w", encoding="utf-8") as fp:
        fp.write(res.experimental.export_to_markdown())

    # Report only after the file has actually been written.
    print(
        f"Document {res.input.file.name} converted with status {res.status}."
        f"\nSaved markdown output to: {str(out_path)}"
    )