Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-08 14:42:33 +02:00
49 changed files with 1621 additions and 1432 deletions

View File

@@ -8,7 +8,6 @@ import yaml
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@@ -31,19 +30,23 @@ def export_documents(
doc_filename = conv_res.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w") as fp:
with (output_dir / f"{doc_filename}.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
if USE_EXPERIMENTAL:

View File

@@ -5,7 +5,6 @@ from pathlib import Path
from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.pdf_document_converter import PdfDocumentConverter
@@ -28,19 +27,23 @@ def export_documents(
doc_filename = conv_res.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w") as fp:
with (output_dir / f"{doc_filename}.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
else:
@@ -82,7 +85,7 @@ def main():
# PyPdfium with OCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True

View File

@@ -1,12 +1,10 @@
import logging
import time
from pathlib import Path
from typing import Tuple
from docling.datamodel.base_models import (
ConversionStatus,
FigureElement,
PageElement,
PdfPipelineOptions,
Table,
)

View File

@@ -1,7 +1,6 @@
import logging
import time
from pathlib import Path
from typing import Tuple
import pandas as pd

View File

@@ -1,22 +1,8 @@
import json
import logging
from pathlib import Path
from typing import Iterable
import yaml
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
InputFormat,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)