docling/examples/run_with_formats.py
Maxim Lysak cefc34e8d8 Working on a first version of DOCX native backend
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
2024-10-04 18:19:40 +02:00

43 lines
1.5 KiB
Python

from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
InputFormat,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
input_paths = [
# Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),
# Path("tests/data/powerpoint_sample.pptx"),
# Path("tests/data/2206.01062.pdf"),
]
input = DocumentConversionInput.from_paths(input_paths)
# for defaults use:
doc_converter = DocumentConverter()
# to customize use:
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
# formats=[InputFormat.PDF, InputFormat.DOCX],
# format_options={
# InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
# InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
# }
# )
conv_results = doc_converter.convert(input)
for res in conv_results:
print(
f"Document {res.input.file.name} converted with status {res.status}. Content:"
)
print(res.experimental.export_to_markdown())