Change code to use unordered/ordered list, robustifications

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-11 14:53:38 +02:00
parent 3ee97c42b2
commit 95c1f80087
14 changed files with 111 additions and 78 deletions

View File

@@ -6,13 +6,8 @@ from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)

View File

@@ -2,14 +2,11 @@ import json
import logging
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import (
DocumentConverter,
FormatOption,
PdfFormatOption,
WordFormatOption,
)
@@ -40,6 +37,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(