Update examples and test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-09 15:20:27 +02:00
parent 080042d06d
commit 0dfbd0b6fc
25 changed files with 181 additions and 150 deletions

View File

@@ -12,7 +12,7 @@ from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
USE_EXPERIMENTAL = False
USE_EXPERIMENTAL = True
def export_documents(

View File

@@ -7,7 +7,7 @@ from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, FormatOption
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
_log = logging.getLogger(__name__)
@@ -104,9 +104,7 @@ def main():
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options
)
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)

View File

@@ -3,9 +3,15 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.document_converter import (
DocumentConverter,
FormatOption,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
@@ -22,23 +28,25 @@ input_paths = [
]
input = DocumentConversionInput.from_paths(input_paths)
# for defaults use:
doc_converter = DocumentConverter()
## for defaults use:
# doc_converter = DocumentConverter()
# to customize use:
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
# formats=[
# InputFormat.PDF,
# InputFormat.DOCX,
# ], # whitelist formats, other files are ignored.
# format_options={
# InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
# InputFormat.DOCX: FormatOption(
# pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend
# ),
# # InputFormat.IMAGE: PdfFormatOption(),
# },
# )
## to customize use:
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
formats=[
InputFormat.PDF,
InputFormat.DOCX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
# InputFormat.IMAGE: PdfFormatOption(),
},
)
conv_results = doc_converter.convert(input)