mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
Update examples and test cases
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -12,7 +12,7 @@ from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
USE_EXPERIMENTAL = False
|
||||
USE_EXPERIMENTAL = True
|
||||
|
||||
|
||||
def export_documents(
|
||||
|
||||
@@ -7,7 +7,7 @@ from typing import Iterable
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, FormatOption
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -104,9 +104,7 @@ def main():
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options
|
||||
)
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -3,9 +3,15 @@ from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
FormatOption,
|
||||
PdfFormatOption,
|
||||
WordFormatOption,
|
||||
)
|
||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
|
||||
@@ -22,23 +28,25 @@ input_paths = [
|
||||
]
|
||||
input = DocumentConversionInput.from_paths(input_paths)
|
||||
|
||||
# for defaults use:
|
||||
doc_converter = DocumentConverter()
|
||||
## for defaults use:
|
||||
# doc_converter = DocumentConverter()
|
||||
|
||||
# to customize use:
|
||||
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
# formats=[
|
||||
# InputFormat.PDF,
|
||||
# InputFormat.DOCX,
|
||||
# ], # whitelist formats, other files are ignored.
|
||||
# format_options={
|
||||
# InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
|
||||
# InputFormat.DOCX: FormatOption(
|
||||
# pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend
|
||||
# ),
|
||||
# # InputFormat.IMAGE: PdfFormatOption(),
|
||||
# },
|
||||
# )
|
||||
## to customize use:
|
||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
formats=[
|
||||
InputFormat.PDF,
|
||||
InputFormat.DOCX,
|
||||
], # whitelist formats, other files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||
),
|
||||
# InputFormat.IMAGE: PdfFormatOption(),
|
||||
},
|
||||
)
|
||||
|
||||
conv_results = doc_converter.convert(input)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user