feat: Introduce plugin support for document conversion

This change enables users to extend the document conversion process with custom logic through plugins.

- Introduced a PluginManager to handle preprocessing and postprocessing plugins in DocumentConverter.
- Updated DocumentConverter to accept and initialize a list of plugins.
- Implemented plugin execution within the document processing pipeline, enabling custom modifications before and after conversion.
- Updated ConversionResult to include metadata about the plugins used during conversion.
- Updated the CLI to accept plugin paths and load them dynamically.
- Expanded documentation with examples for creating and using plugins.
- Added test cases to verify plugin integration and ensure correct execution in various scenarios.

Signed-off-by: Ayoub El Bouchtili <Ayoub.elbouchtili@gmail.com>
2025-12-18 01:28:09 +00:00 · 2025-01-18 15:49:51 +01:00
parent 2cbc5ce521
commit 224d633b7e
11 changed files with 665 additions and 6 deletions
--- a/docs/examples/plugin_basic.py
+++ b/docs/examples/plugin_basic.py
@@ -0,0 +1,66 @@
+from datetime import datetime
+from docling.datamodel.document import InputDocument, ConversionResult
+from docling.document_converter import DocumentConverter
+from docling.plugins import DoclingPlugin, PluginMetadata
+
+class BasicPlugin(DoclingPlugin):
+    """Demo plugin: timestamps both pipeline stages and tags the first text item."""
+
+    def __init__(self):
+        # Both stage dicts start empty; preprocess() and postprocess() fill them.
+        plugin_info = PluginMetadata(
+            version="0.1.0",
+            description="A basic plugin that adds processing metadata and modifies text after conversion.",
+            author="Ayoub EL BOUCHTILI",
+            preprocess={},
+            postprocess={},
+        )
+        super().__init__(name="BasicPlugin", metadata=plugin_info)
+
+    def preprocess(self, input_doc: InputDocument) -> InputDocument:
+        """Record when preprocessing ran and hand the input document back.
+
+        An empty ``_plugin_metadata`` dict is attached to ``input_doc`` when
+        the attribute is not already present.
+        """
+        if not hasattr(input_doc, "_plugin_metadata"):
+            input_doc._plugin_metadata = {}
+        self.metadata.preprocess = {"timestamp": datetime.now().isoformat()}
+        return input_doc
+
+    def postprocess(self, result: ConversionResult) -> ConversionResult:
+        """Prefix the first text item with a marker and publish this plugin's metadata.
+
+        The metadata is written to ``result.plugins`` only if no entry exists
+        under this plugin's name yet.
+        """
+        marker = f"[Processed by {self.name}]"
+        document = result.document
+        if document and document.texts:
+            # Only the first text item is touched; the rest stay as converted.
+            head = document.texts[0]
+            head.text = f"{marker} {head.text}"
+
+        self.metadata.postprocess = {
+            "appended_text": marker,
+            "timestamp": datetime.now().isoformat(),
+        }
+        if self.name not in result.plugins:
+            result.plugins[self.name] = self.metadata.model_dump()
+
+        return result
+
+def main():
+    """Convert the sample DOCX with BasicPlugin attached and print the outcome."""
+    # The converter takes its plugins as a list at construction time.
+    plugin_list = [BasicPlugin()]
+    converter = DocumentConverter(plugins=plugin_list)
+
+    conversion = converter.convert("./tests/data/docx/word_sample.docx")
+
+    # Report the final status, then the per-plugin metadata on the result.
+    print(f"Conversion completed with status: {conversion.status}")
+    print(f"Plugins metadata: {conversion.plugins}")
+
+if __name__ == "__main__":  # run the demo only when executed as a script, not on import
+    main() 
--- a/docs/examples/plugin_translation.py
+++ b/docs/examples/plugin_translation.py
@@ -0,0 +1,105 @@
+from typing import Optional
+from datetime import datetime
+
+from docling.datamodel.document import InputDocument, ConversionResult
+from docling.plugins import DoclingPlugin, PluginMetadata
+from docling_core.types.doc import TextItem, TableItem
+
+class TranslationPlugin(DoclingPlugin):
+    """Plugin that translates document text to a target language."""
+
+    def __init__(self, target_lang: str, source_lang: Optional[str] = None):
+        """Initialize the translation plugin.
+
+        Args:
+            target_lang: Target language code (e.g. 'fr' for French)
+            source_lang: Optional source language code. If not provided,
+                        will be auto-detected during translation
+        """
+        super().__init__(
+            name="TranslationPlugin",
+            metadata=PluginMetadata(
+                version="0.1.0",
+                description=f"Translates document text to {target_lang}",
+                author="Ayoub EL BOUCHTILI",
+                preprocess={},
+                postprocess={}
+            )
+        )
+        self.target_lang = target_lang
+        self.source_lang = source_lang
+
+    def translate_text(self, text: str) -> tuple[str, str]:
+        """Translate text to the target language (placeholder: returns it unchanged).
+
+        Args:
+            text: Text to translate
+
+        Returns:
+            Tuple of (translated_text, detected_source_lang)
+        """
+        # IMPLEMENT YOUR TRANSLATION LOGIC HERE
+        # FOR EXAMPLE USING GOOGLE TRANSLATE:
+
+        # from googletrans import Translator
+        # translator = Translator()
+        # if self.source_lang:
+        #     result = translator.translate(text, src=self.source_lang, dest=self.target_lang)
+        # else:
+        #     result = translator.translate(text, dest=self.target_lang)
+        # return result.text, result.src
+
+        # END OF PLACEHOLDER IMPLEMENTATION
+        return text, self.source_lang or "en"
+
+    def postprocess(self, result: ConversionResult) -> ConversionResult:
+        """Translate text items and table cells, then record translation metadata.
+
+        Args:
+            result: Conversion result whose document is translated in place.
+
+        Returns:
+            The same ``result``; its ``plugins`` dict gains this plugin's
+            metadata unless an entry under this plugin's name already exists.
+        """
+        # Bound up front: the metadata below reads it even if nothing is translated.
+        detected_langs = set()
+
+        # Gate on the document only, so a table-only document (no text items)
+        # still gets its cells translated.
+        if result.document:
+            for item, _level in result.document.iterate_items():
+                if isinstance(item, TextItem):
+                    item.text, detected = self.translate_text(item.text)
+                    detected_langs.add(detected)
+                elif isinstance(item, TableItem):
+                    for cell in item.data.table_cells:
+                        cell.text, detected = self.translate_text(cell.text)
+                        detected_langs.add(detected)
+
+        self.metadata.postprocess = {
+            "target_language": self.target_lang,
+            "source_languages": list(detected_langs),
+            "timestamp": datetime.now().isoformat(),
+        }
+        if self.name not in result.plugins:
+            result.plugins[self.name] = self.metadata.model_dump()
+        return result
+
+def main():
+    """Convert the sample DOCX with a French TranslationPlugin and print the outcome."""
+    # Imported here rather than at module top; only this demo needs the converter.
+    from docling.document_converter import DocumentConverter
+
+    # target_lang="fr" requests French; no source_lang is passed.
+    plugins = [TranslationPlugin(target_lang="fr")]
+    converter = DocumentConverter(plugins=plugins)
+
+    conversion = converter.convert("./tests/data/docx/word_sample.docx")
+
+    # Report the final status, then the per-plugin metadata on the result.
+    print(f"Conversion completed with status: {conversion.status}")
+    print(f"Plugin metadata: {conversion.plugins}")
+
+if __name__ == "__main__":  # run the demo only when executed as a script, not on import
+    main()