Improved demo code so that it saves the output Markdown (.md) documents to files

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-10-07 17:25:17 +02:00
parent bea9fc22af
commit f773d8a621

View File

@@ -1,17 +1,27 @@
import json
import logging
from pathlib import Path
from typing import Iterable
import yaml
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
InputFormat,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import DocumentConversionInput
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
# Module-level logger, named after this module via __name__.
_log = logging.getLogger(__name__)
# Boolean switch, off by default.
# NOTE(review): the code that reads this flag is outside the visible hunk —
# presumably it selects the experimental pipeline/backends; confirm.
USE_EXPERIMENTAL = False
input_paths = [
# Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
@@ -36,7 +46,11 @@ doc_converter = DocumentConverter()
# Convert the configured inputs, then print each result and save it as Markdown.

# Path.open()/write_text() do not create missing parent directories, so make
# sure the output directory exists before the first write.
scratch_dir = Path("./scratch")
scratch_dir.mkdir(parents=True, exist_ok=True)

conv_results = doc_converter.convert(input)

for res in conv_results:
    print("")
    print(
        f"Document {res.input.file.name} converted with status {res.status}. Content:"
    )

    # Render once and reuse the text for both the console and the file.
    markdown = res.experimental.export_to_markdown()
    print(markdown)

    # Export Docling document format to markdown (experimental):
    # The encoding is pinned to UTF-8 so the output does not depend on the
    # platform's default text encoding.
    out_file = scratch_dir / f"{res.input.file.name}.experimental.md"
    out_file.write_text(markdown, encoding="utf-8")