mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
This change enables users to extend the document conversion process with custom logic through plugins. - Introduced a PluginManager to handle preprocessing and postprocessing plugins in DocumentConverter. - Updated DocumentConverter to accept and initialize a list of plugins. - Implemented plugin execution within the document processing pipeline, enabling custom modifications before and after conversion. - Updated ConversionResult to include metadata about the plugins used during conversion. - Updated the CLI to accept plugin paths and load them dynamically. - Expanded documentation with examples for creating and using plugins. - Added test cases to verify plugin integration and ensure correct execution in various scenarios. Signed-off-by: Ayoub El Bouchtili <Ayoub.elbouchtili@gmail.com>
66 lines
2.3 KiB
Python
66 lines
2.3 KiB
Python
from datetime import datetime
|
|
from docling.datamodel.document import InputDocument, ConversionResult
|
|
from docling.document_converter import DocumentConverter
|
|
from docling.plugins import DoclingPlugin, PluginMetadata
|
|
|
|
class BasicPlugin(DoclingPlugin):
    """Sample plugin that stamps processing metadata and tags converted text.

    ``preprocess`` records a timestamp before conversion. ``postprocess``
    prefixes the first text item with a marker, records what it did, and
    publishes the plugin metadata on the conversion result.
    """

    def __init__(self):
        """Register the plugin as ``BasicPlugin`` with empty per-stage metadata."""
        plugin_info = PluginMetadata(
            version="0.1.0",
            description="A basic plugin that adds processing metadata and modifies text after conversion.",
            author="Ayoub EL BOUCHTILI",
            preprocess={},
            postprocess={},
        )
        super().__init__(name="BasicPlugin", metadata=plugin_info)

    def preprocess(self, input_doc: InputDocument) -> InputDocument:
        """Prepare the input document and record when preprocessing ran.

        Args:
            input_doc: Document that is about to be converted.

        Returns:
            The same ``input_doc`` object that was passed in.
        """
        # Give the document a `_plugin_metadata` dict if it does not carry one yet.
        needs_slot = not hasattr(input_doc, "_plugin_metadata")
        if needs_slot:
            input_doc._plugin_metadata = {}

        # Naive local wall-clock time, serialized as ISO 8601.
        started_at = datetime.now().isoformat()
        self.metadata.preprocess = {"timestamp": started_at}
        return input_doc

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        """Tag the first text item and attach this plugin's metadata to ``result``.

        Args:
            result: Conversion result to post-process.

        Returns:
            The same ``result`` object, modified in place.
        """
        marker = f"[Processed by {self.name}]"

        document = result.document
        if document and document.texts:
            # Only the first text item is tagged; the marker goes in front of its text.
            leading_item = document.texts[0]
            leading_item.text = f"{marker} {leading_item.text}"

        # The postprocess metadata is rewritten on every call.
        finished_at = datetime.now().isoformat()
        self.metadata.postprocess = {
            "appended_text": marker,
            "timestamp": finished_at,
        }

        # Publish a metadata snapshot; an entry already stored under this
        # plugin's name is left untouched.
        published = result.plugins
        if self.name not in published:
            published[self.name] = self.metadata.model_dump()

        return result
|
|
|
|
def main(source: str = "./tests/data/docx/word_sample.docx") -> None:
    """Convert one document with ``BasicPlugin`` enabled and print the outcome.

    Args:
        source: Document to convert; it is passed unchanged to
            ``DocumentConverter.convert``. Defaults to the sample DOCX this
            example originally hard-coded, so calling ``main()`` with no
            arguments behaves as before. The default is a relative path,
            so it presumably only resolves when run from the repository root.
    """
    # Create plugin instance
    basic_plugin = BasicPlugin()

    # Initialize converter with a plugin
    converter = DocumentConverter(plugins=[basic_plugin])

    # Convert a document
    result = converter.convert(source)
    print(f"Conversion completed with status: {result.status}")
    print(f"Plugins metadata: {result.plugins}")
|
|
|
|
# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()