mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-18 01:28:09 +00:00
feat: Introduce plugin support for document conversion
This change enables users to extend the document conversion process with custom logic through plugins.

- Introduced a PluginManager to handle preprocessing and postprocessing plugins in DocumentConverter.
- Updated DocumentConverter to accept and initialize a list of plugins.
- Implemented plugin execution within the document processing pipeline, enabling custom modifications before and after conversion.
- Updated ConversionResult to include metadata about the plugins used during conversion.
- Updated the CLI to accept plugin paths and load them dynamically.
- Expanded documentation with examples for creating and using plugins.
- Added test cases to verify plugin integration and ensure correct execution in various scenarios.

Signed-off-by: Ayoub El Bouchtili <Ayoub.elbouchtili@gmail.com>
This commit is contained in:
66
docs/examples/plugin_basic.py
Normal file
66
docs/examples/plugin_basic.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from datetime import datetime
|
||||
from docling.datamodel.document import InputDocument, ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.plugins import DoclingPlugin, PluginMetadata
|
||||
|
||||
class BasicPlugin(DoclingPlugin):
    """Minimal plugin that timestamps both hooks and tags the first text item."""

    def __init__(self):
        """Register the plugin as ``BasicPlugin`` with empty hook metadata."""
        plugin_info = PluginMetadata(
            version="0.1.0",
            description="A basic plugin that adds processing metadata and modifies text after conversion.",
            author="Ayoub EL BOUCHTILI",
            preprocess={},
            postprocess={},
        )
        super().__init__(name="BasicPlugin", metadata=plugin_info)

    def preprocess(self, input_doc: InputDocument) -> InputDocument:
        """Record the preprocessing time on the plugin metadata.

        Args:
            input_doc: Document about to be converted. It receives an empty
                ``_plugin_metadata`` dict when it does not carry one yet.

        Returns:
            The same ``input_doc`` object.
        """
        has_slot = hasattr(input_doc, '_plugin_metadata')
        if not has_slot:
            input_doc._plugin_metadata = {}

        started_at = datetime.now().isoformat()
        self.metadata.preprocess = {"timestamp": started_at}
        return input_doc

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        """Prefix the first text item with a marker and publish plugin metadata.

        Args:
            result: Conversion result whose document is modified in place.

        Returns:
            The same ``result`` object, with this plugin's metadata stored in
            ``result.plugins`` unless an entry under its name already exists.
        """
        marker = f"[Processed by {self.name}]"

        document = result.document
        if document and document.texts:
            # Only the very first text item is tagged.
            leading_item = document.texts[0]
            leading_item.text = f"{marker} {leading_item.text}"

        # The metadata is refreshed on every call, whether or not text was tagged.
        self.metadata.postprocess = {
            "appended_text": marker,
            "timestamp": datetime.now().isoformat(),
        }

        # Never overwrite an entry that is already registered under this name.
        already_registered = self.name in result.plugins
        if not already_registered:
            result.plugins[self.name] = self.metadata.model_dump()

        return result
|
||||
|
||||
def main():
    """Convert the sample DOCX with ``BasicPlugin`` attached and print the outcome."""
    # Build the converter around a single plugin instance.
    plugin = BasicPlugin()
    doc_converter = DocumentConverter(plugins=[plugin])

    # Run the conversion and report status plus the metadata the plugin left behind.
    conversion = doc_converter.convert("./tests/data/docx/word_sample.docx")
    print(f"Conversion completed with status: {conversion.status}")
    print(f"Plugins metadata: {conversion.plugins}")


if __name__ == "__main__":
    main()
|
||||
105
docs/examples/plugin_translation.py
Normal file
105
docs/examples/plugin_translation.py
Normal file
@@ -0,0 +1,105 @@
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from docling.datamodel.document import InputDocument, ConversionResult
|
||||
from docling.plugins import DoclingPlugin, PluginMetadata
|
||||
from docling_core.types.doc import TextItem, TableItem
|
||||
|
||||
class TranslationPlugin(DoclingPlugin):
    """Plugin that translates document text to a target language.

    Translation runs in ``postprocess`` over text items and table cells.
    ``translate_text`` is a placeholder that returns its input unchanged until
    real translation logic is supplied.
    """

    def __init__(self, target_lang: str, source_lang: Optional[str] = None):
        """Initialize the translation plugin.

        Args:
            target_lang: Target language code (e.g. 'fr' for French)
            source_lang: Optional source language code. If not provided,
                will be auto-detected during translation
        """
        super().__init__(
            name="TranslationPlugin",
            metadata=PluginMetadata(
                version="0.1.0",
                description=f"Translates document text to {target_lang}",
                author="Ayoub EL BOUCHTILI",
                preprocess={},
                postprocess={},
            ),
        )
        self.target_lang = target_lang
        self.source_lang = source_lang

    def translate_text(self, text: str) -> tuple[str, str]:
        """Translate text to target language.

        The shipped body is a placeholder: it returns ``text`` unchanged and
        reports ``self.source_lang`` (or ``"en"`` when unset) as the source.

        Args:
            text: Text to translate

        Returns:
            Tuple of (translated_text, detected_source_lang)
        """
        # IMPLEMENT YOUR TRANSLATION LOGIC HERE
        # FOR EXAMPLE USING GOOGLE TRANSLATE:

        # from googletrans import Translator
        # translator = Translator()
        # if self.source_lang:
        #     result = translator.translate(text, src=self.source_lang, dest=self.target_lang)
        # else:
        #     result = translator.translate(text, dest=self.target_lang)
        # return result.text, result.src

        # END OF PLACEHOLDER IMPLEMENTATION
        return text, self.source_lang or "en"

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        """Translate document text after conversion.

        Text items and table cells are rewritten in place with the output of
        ``translate_text``. The target language, the detected source languages
        and a timestamp are stored in ``self.metadata.postprocess``.

        Args:
            result: Conversion result whose document is translated in place.

        Returns:
            The same ``result`` object. When something was translatable, the
            plugin metadata is stored in ``result.plugins`` unless an entry
            under this plugin's name already exists.
        """
        document = result.document
        # Checking ``texts`` alone would skip documents that contain only
        # tables, although their cells are translated below.
        if not document or not (document.texts or document.tables):
            return result

        detected_langs = set()

        # Each yielded element carries the document item at index 0.
        for element in document.iterate_items():
            item = element[0]
            if isinstance(item, TextItem):
                translated, detected = self.translate_text(item.text)
                item.text = translated
                detected_langs.add(detected)
            elif isinstance(item, TableItem):
                # Tables are translated cell by cell.
                for cell in item.data.table_cells:
                    translated, detected = self.translate_text(cell.text)
                    cell.text = translated
                    detected_langs.add(detected)

        # Sort the languages so the recorded metadata is deterministic; plain
        # set iteration order can vary between runs.
        self.metadata.postprocess = {
            "target_language": self.target_lang,
            "source_languages": sorted(detected_langs),
            "timestamp": datetime.now().isoformat(),
        }

        # Never overwrite an entry that is already registered under this name.
        if self.name not in result.plugins:
            result.plugins[self.name] = self.metadata.model_dump()

        return result
|
||||
|
||||
def main():
    """Convert the sample DOCX with a French ``TranslationPlugin`` and print the outcome."""
    # Imported here so the converter is only needed when running the example.
    from docling.document_converter import DocumentConverter

    # Build the converter around a single plugin targeting French.
    plugin = TranslationPlugin(target_lang="fr")
    doc_converter = DocumentConverter(plugins=[plugin])

    # Run the conversion and report status plus the metadata the plugin left behind.
    conversion = doc_converter.convert("./tests/data/docx/word_sample.docx")
    print(f"Conversion completed with status: {conversion.status}")
    print(f"Plugin metadata: {conversion.plugins}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user