Enrich simple pipeline
In [ ]:
Copied!
import logging
from pathlib import Path
import logging
from pathlib import Path
In [ ]:
Copied!
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.document_converter import (
DocumentConverter,
HTMLFormatOption,
WordFormatOption,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.document_converter import (
DocumentConverter,
HTMLFormatOption,
WordFormatOption,
)
In [ ]:
Copied!
# Module-level logger, named after this module via __name__.
_log = logging.getLogger(__name__)
# NOTE(review): verbatim repeat of the assignment above; it rebinds _log to a logger of the same name.
_log = logging.getLogger(__name__)
In [ ]:
Copied!
def main():
    """Convert the sample Word document and print it as Markdown.

    Picture classification and picture description are switched on in a
    single ``ConvertPipelineOptions`` instance, which is shared by the DOCX
    and HTML format options handed to the ``DocumentConverter``.
    """
    enrich_options = ConvertPipelineOptions()
    enrich_options.do_picture_classification = True
    enrich_options.do_picture_description = True

    # The same options object is reused for every configured input format.
    per_format = {
        InputFormat.DOCX: WordFormatOption(pipeline_options=enrich_options),
        InputFormat.HTML: HTMLFormatOption(pipeline_options=enrich_options),
    }
    converter = DocumentConverter(format_options=per_format)

    source = Path("tests/data/docx/word_sample.docx")
    result = converter.convert(source)
    markdown = result.document.export_to_markdown()
    print(markdown)
def main():
    """Run the enrichment example on the bundled Word sample.

    Builds a ``DocumentConverter`` whose DOCX and HTML format options share
    one ``ConvertPipelineOptions`` object with picture classification and
    picture description enabled, converts the sample file, and prints the
    resulting document exported to Markdown.
    """
    docx_file = Path("tests/data/docx/word_sample.docx")

    # Enable both picture enrichments on one shared options object.
    opts = ConvertPipelineOptions()
    opts.do_picture_classification = True
    opts.do_picture_description = True

    word_option = WordFormatOption(pipeline_options=opts)
    html_option = HTMLFormatOption(pipeline_options=opts)
    converter = DocumentConverter(
        format_options={
            InputFormat.DOCX: word_option,
            InputFormat.HTML: html_option,
        },
    )

    conversion = converter.convert(docx_file)
    print(conversion.document.export_to_markdown())
In [ ]:
Copied!
# Script entry point: call main() only when this file is executed directly.
if __name__ == "__main__":
    main()
# Script entry point: call main() only when this file is executed directly.
if __name__ == "__main__":
    main()