mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -21,57 +21,35 @@ input_paths = [
|
||||
Path("tests/data/word_sample.docx"),
|
||||
Path("tests/data/lorem_ipsum.docx"),
|
||||
Path("tests/data/powerpoint_sample.pptx"),
|
||||
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
Path("tests/data/2206.01062.pdf"),
|
||||
# Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
]
|
||||
|
||||
## for defaults use:
|
||||
# doc_converter = DocumentConverter()
|
||||
|
||||
## to customize use:
|
||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
allowed_formats=[
|
||||
InputFormat.PDF,
|
||||
# InputFormat.IMAGE,
|
||||
InputFormat.DOCX,
|
||||
InputFormat.HTML,
|
||||
InputFormat.PPTX,
|
||||
], # whitelist formats, other files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||
),
|
||||
# InputFormat.IMAGE: PdfFormatOption(),
|
||||
},
|
||||
)
|
||||
|
||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
pdf=None,
|
||||
docx=WordFormatOption(
|
||||
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||
),
|
||||
allowed_formats=[
|
||||
InputFormat.PDF,
|
||||
# InputFormat.IMAGE,
|
||||
InputFormat.DOCX,
|
||||
InputFormat.HTML,
|
||||
InputFormat.PPTX,
|
||||
], # whitelist formats, other files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||
),
|
||||
# InputFormat.IMAGE: PdfFormatOption(),
|
||||
},
|
||||
doc_converter = (
|
||||
DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
allowed_formats=[
|
||||
InputFormat.PDF,
|
||||
InputFormat.IMAGE,
|
||||
InputFormat.DOCX,
|
||||
InputFormat.HTML,
|
||||
InputFormat.PPTX,
|
||||
], # whitelist formats, non-matching files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
||||
),
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||
),
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
conv_results = doc_converter.convert_all(input_paths)
|
||||
|
||||
for res in conv_results:
|
||||
|
||||
Reference in New Issue
Block a user