mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
Change code to use unordered/ordered list, robustifications
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -6,13 +6,8 @@ from typing import Iterable
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -2,14 +2,11 @@ import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
FormatOption,
|
||||
PdfFormatOption,
|
||||
WordFormatOption,
|
||||
)
|
||||
@@ -40,6 +37,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
||||
# InputFormat.IMAGE,
|
||||
InputFormat.DOCX,
|
||||
InputFormat.HTML,
|
||||
InputFormat.PPTX,
|
||||
], # whitelist formats, other files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
|
||||
Reference in New Issue
Block a user