def _load_plugin(plugin_spec: str) -> DoclingPlugin:
    """Load and instantiate a plugin from a module path specification.

    Format: 'module.path:PluginClass'
    Example: 'myapp.plugins:CustomPlugin'

    Args:
        plugin_spec: Import specification of the plugin class: the module
            path and the class name, separated by exactly one colon.

    Returns:
        A new instance of the plugin class, created without arguments.

    Raises:
        typer.Abort: If the specification is malformed, the module cannot be
            imported, the attribute is missing or is not a ``DoclingPlugin``
            subclass, or the class cannot be instantiated. The reason is
            printed to the error console before aborting.
    """
    try:
        # partition() never raises, so a missing or misplaced colon is
        # reported with an explicit message instead of an unpacking error.
        module_path, separator, class_name = plugin_spec.partition(":")
        if not separator or not module_path or not class_name or ":" in class_name:
            raise ValueError(
                "Plugin specification must be in the format "
                "'module.path:PluginClass'"
            )

        module = importlib.import_module(module_path)
        plugin_class = getattr(module, class_name)

        # issubclass() raises TypeError for non-class objects (functions,
        # instances, constants), so confirm it is a class first.
        if not (
            isinstance(plugin_class, type)
            and issubclass(plugin_class, DoclingPlugin)
        ):
            raise ValueError(f"Class {class_name} is not a DoclingPlugin subclass")

        return plugin_class()
    except Exception as e:
        err_console.print(f"[red]Error loading plugin {plugin_spec}: {str(e)}[/red]")
        # Keep the original failure attached for debugging with tracebacks.
        raise typer.Abort() from e
Must be in the format 'module.path:PluginClass'", + ), + ] = None, ): if verbose == 0: logging.basicConfig(level=logging.WARNING) @@ -394,9 +423,23 @@ def convert( InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, } + + loaded_plugins = [] + if plugins: + for plugin_spec in plugins: + try: + plugin = _load_plugin(plugin_spec) + loaded_plugins.append(plugin) + except Exception as e: + if abort_on_error: + raise + _log.warning(f"Failed to load plugin {plugin_spec}: {e}") + continue + doc_converter = DocumentConverter( allowed_formats=from_formats, format_options=format_options, + plugins=loaded_plugins, ) start_time = time.time() diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 136428e8..7f4385d2 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -5,6 +5,7 @@ from pathlib import Path, PurePath from typing import ( TYPE_CHECKING, + Any, Dict, Iterable, List, @@ -44,7 +45,7 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument from docling_core.utils.file import resolve_source_to_stream from docling_core.utils.legacy import docling_document_to_legacy -from pydantic import BaseModel +from pydantic import BaseModel, Field from typing_extensions import deprecated from docling.backend.abstract_backend import ( @@ -104,7 +105,7 @@ class InputDocument(BaseModel): filesize: Optional[int] = None page_count: int = 0 - + _backend: AbstractDocumentBackend # Internal PDF backend used def __init__( @@ -198,6 +199,9 @@ class ConversionResult(BaseModel): timings: Dict[str, ProfilingItem] = {} document: DoclingDocument = _EMPTY_DOCLING_DOC + + # Metadata object for tracking plugins details and pre/post processing related data + plugins: Dict[str, Any] = Field(default_factory=dict) @property @deprecated("Use document instead.") diff --git a/docling/document_converter.py b/docling/document_converter.py index cb073949..83f4192f 100644 --- 
a/docling/document_converter.py +++ b/docling/document_converter.py @@ -36,6 +36,7 @@ from docling.pipeline.simple_pipeline import SimplePipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docling.utils.utils import chunkify +from docling.plugins import PluginManager, DoclingPlugin _log = logging.getLogger(__name__) @@ -148,8 +149,9 @@ class DocumentConverter: def __init__( self, - allowed_formats: Optional[List[InputFormat]] = None, format_options: Optional[Dict[InputFormat, FormatOption]] = None, + allowed_formats: Optional[List[InputFormat]] = None, + plugins: Optional[List[DoclingPlugin]] = None, ): self.allowed_formats = ( allowed_formats if allowed_formats is not None else [e for e in InputFormat] @@ -163,6 +165,11 @@ def __init__( for format in self.allowed_formats } self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {} + self.plugin_manager = PluginManager() + + if plugins: + for plugin in plugins: + self.plugin_manager.register_plugin(plugin) def initialize_pipeline(self, format: InputFormat): """Initialize the conversion pipeline for the selected format.""" @@ -280,6 +287,14 @@ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]: def _process_document( self, in_doc: InputDocument, raises_on_error: bool ) -> ConversionResult: + conv_res = ConversionResult(input=in_doc) + + try: + in_doc = self.plugin_manager.execute_preprocessors(in_doc) + except Exception as e: + if raises_on_error: + raise e + _log.error(f"Plugin preprocessing failed: {str(e)}") valid = ( self.allowed_formats is not None and in_doc.format in self.allowed_formats @@ -296,9 +311,15 @@ def _process_document( module_name="", error_message=error_message, ) - conv_res = ConversionResult( - input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item] - ) + conv_res.status = ConversionStatus.SKIPPED + conv_res.errors.append(error_item) + + try: + conv_res = self.plugin_manager.execute_postprocessors(conv_res) + 
except Exception as e: + if raises_on_error: + raise e + _log.error(f"Plugin postprocessing failed: {str(e)}") return conv_res diff --git a/docling/plugins/__init__.py b/docling/plugins/__init__.py new file mode 100644 index 00000000..41f271d9 --- /dev/null +++ b/docling/plugins/__init__.py @@ -0,0 +1,7 @@ +"""Docling plugin system for extending document processing capabilities.""" + +from .base import DoclingPlugin +from .manager import PluginManager +from .models import PluginMetadata + +__all__ = ["DoclingPlugin", "PluginManager", "PluginMetadata"] \ No newline at end of file diff --git a/docling/plugins/base.py b/docling/plugins/base.py new file mode 100644 index 00000000..79efd8e0 --- /dev/null +++ b/docling/plugins/base.py @@ -0,0 +1,20 @@ +"""Base plugin class for Docling plugins.""" + +from docling.datamodel.document import InputDocument, ConversionResult +from docling.plugins.models import PluginMetadata + +class DoclingPlugin: + """Base class for Docling plugins.""" + + def __init__(self, name: str, metadata: PluginMetadata): + """Initialize the plugin.""" + self.name = name + self.metadata = metadata + + def preprocess(self, input_doc: InputDocument) -> InputDocument: + """Preprocess the input document. Default implementation returns the input unmodified.""" + return input_doc + + def postprocess(self, result: ConversionResult) -> ConversionResult: + """Postprocess the conversion result. 
class PluginManager:
    """Manage the registration and execution of Docling plugins.

    Registered plugins are validated, indexed by name, and queued for each
    processing stage they implement. ``execute_preprocessors`` and
    ``execute_postprocessors`` run the queued plugins in registration order,
    feeding each plugin the value returned by the previous one.
    """

    # A letter followed by letters, digits, underscores, or hyphens.
    NAME_PATTERN = re.compile(r'^[a-zA-Z][a-zA-Z0-9_-]*$')

    # Hook method name -> label used in error messages.
    _STAGE_LABELS = {"preprocess": "preprocessor", "postprocess": "postprocessor"}

    def __init__(self):
        """Create a manager with no registered plugins."""
        self.preprocessors: "List[DoclingPlugin]" = []
        self.postprocessors: "List[DoclingPlugin]" = []
        self._registered_names: "Dict[str, DoclingPlugin]" = {}

    def _validate_plugin_name(self, name: str) -> None:
        """Validate plugin name format.

        Raises:
            ValueError: If the name is empty, whitespace-only, not a string,
                or does not match ``NAME_PATTERN`` in its entirety.
        """
        if not name or (isinstance(name, str) and name.isspace()):
            raise ValueError("Plugin name cannot be empty or whitespace")
        if not isinstance(name, str):
            raise ValueError(f"Plugin name must be a string, got {type(name)}")
        # fullmatch() rather than match(): "$" also matches just before a
        # trailing newline, so match() would accept names such as "plugin\n".
        if not self.NAME_PATTERN.fullmatch(name):
            raise ValueError(
                "Plugin name must start with a letter and contain only "
                "letters, numbers, underscores, or hyphens"
            )

    @staticmethod
    def _overrides_hook(plugin: "DoclingPlugin", hook_name: str) -> bool:
        """Return True if the plugin supplies its own ``hook_name`` method.

        Hooks that are not bound methods (for example a plain callable
        assigned on the instance) have no ``__func__`` and count as overrides.
        """
        hook = getattr(plugin, hook_name)
        base_hook = getattr(DoclingPlugin, hook_name)
        return getattr(hook, "__func__", None) is not base_hook

    def _validate_plugin(self, plugin: "DoclingPlugin") -> None:
        """Validate all aspects of a plugin.

        Checks the plugin type, its name (format and uniqueness), its
        metadata, and that it implements at least one processing hook.

        Raises:
            ValueError: If any of the checks fails.
        """
        if not isinstance(plugin, DoclingPlugin):
            raise ValueError(f"Plugin must be an instance of DoclingPlugin, got {type(plugin)}")

        self._validate_plugin_name(plugin.name)

        if plugin.name in self._registered_names:
            raise ValueError(f"A plugin with name '{plugin.name}' is already registered")

        if not plugin.metadata:
            raise ValueError(f"Plugin '{plugin.name}' must have metadata")

        # Re-validate through the model: field values may have been changed
        # after construction, which bypasses validation on the instance.
        try:
            metadata_dict = (
                plugin.metadata.model_dump()
                if isinstance(plugin.metadata, PluginMetadata)
                else plugin.metadata
            )
            PluginMetadata(**metadata_dict)
        except Exception as e:
            raise ValueError(
                f"Invalid metadata for plugin '{plugin.name}': {str(e)}"
            ) from e

        # A plugin that inherits both default hooks would do nothing.
        if not (
            self._overrides_hook(plugin, "preprocess")
            or self._overrides_hook(plugin, "postprocess")
        ):
            raise ValueError(
                f"Plugin '{plugin.name}' must implement at least a preprocessing or postprocessing step"
            )

    def register_plugin(self, plugin: "DoclingPlugin") -> None:
        """Validate a plugin and queue it for the stages it implements.

        Raises:
            ValueError: If the plugin is None or fails validation.
        """
        if plugin is None:
            raise ValueError("Plugin cannot be None")

        self._validate_plugin(plugin)
        self._registered_names[plugin.name] = plugin
        # Only queue the plugin where it has its own hook; the inherited
        # default hooks return their argument unchanged.
        if self._overrides_hook(plugin, "preprocess"):
            self.preprocessors.append(plugin)
        if self._overrides_hook(plugin, "postprocess"):
            self.postprocessors.append(plugin)

    def _execute_plugins(self, items: "List[DoclingPlugin]", data, stage: str = ""):
        """Run one hook of every plugin in ``items``, chaining the results.

        Args:
            items: Plugins to run, in execution order.
            data: Object handed to the first plugin.
            stage: Hook to call, ``"preprocess"`` or ``"postprocess"``. When
                empty, the stage is inferred once from the type of ``data``
                (``InputDocument`` means preprocessing).

        Returns:
            The object returned by the last plugin, or ``data`` itself when
            ``items`` is empty.

        Raises:
            ValueError: If ``stage`` is not a known hook name.
            RuntimeError: If a plugin raises or returns None.
        """
        if not stage:
            stage = "preprocess" if isinstance(data, InputDocument) else "postprocess"
        if stage not in self._STAGE_LABELS:
            raise ValueError(f"Unknown plugin stage: {stage!r}")
        label = self._STAGE_LABELS[stage]

        for plugin in items:
            hook = getattr(plugin, stage)
            try:
                outcome = hook(data)
            except Exception as e:
                raise RuntimeError(f"Error in {label} {plugin.__class__.__name__}: {str(e)}") from e
            # A hook that forgets its return statement would otherwise hand
            # None to the next plugin and to the caller.
            if outcome is None:
                raise RuntimeError(
                    f"Error in {label} {plugin.__class__.__name__}: "
                    "returned None instead of the processed object"
                )
            data = outcome
        return data

    def execute_preprocessors(self, input_doc: "InputDocument") -> "InputDocument":
        """Execute all preprocessors.

        Raises:
            ValueError: If ``input_doc`` is None.
            RuntimeError: If a preprocessor fails.
        """
        if input_doc is None:
            raise ValueError("Input document cannot be None")
        return self._execute_plugins(self.preprocessors, input_doc, stage="preprocess")

    def execute_postprocessors(self, result: "ConversionResult") -> "ConversionResult":
        """Execute all postprocessors.

        Raises:
            ValueError: If ``result`` is None.
            RuntimeError: If a postprocessor fails.
        """
        if result is None:
            raise ValueError("Conversion result cannot be None")
        return self._execute_plugins(self.postprocessors, result, stage="postprocess")
plugin metadata validation. + + Attributes: + version: The plugin version following semantic versioning + description: A brief description of the plugin's functionality + author: The plugin author's name + preprocess: Metadata for preprocessing step + postprocess: Metadata for postprocessing step + """ + version: str = Field( + default="", + pattern=r"^\d+\.\d+\.\d+$", + description="Plugin version (semantic versioning)" + ) + description: str = Field( + default="", + description="Brief description of the plugin" + ) + author: str = Field( + default="", + description="Plugin author's name" + ) + preprocess: Dict[str, Any] = Field( + default_factory=dict, + description="Preprocessing related metadata" + ) + postprocess: Dict[str, Any] = Field( + default_factory=dict, + description="Postprocessing related metadata" + ) \ No newline at end of file diff --git a/docs/examples/plugin_basic.py b/docs/examples/plugin_basic.py new file mode 100644 index 00000000..8cf591d9 --- /dev/null +++ b/docs/examples/plugin_basic.py @@ -0,0 +1,66 @@ +from datetime import datetime +from docling.datamodel.document import InputDocument, ConversionResult +from docling.document_converter import DocumentConverter +from docling.plugins import DoclingPlugin, PluginMetadata + +class BasicPlugin(DoclingPlugin): + """Example plugin that adds metadata and modifies text.""" + + def __init__(self): + super().__init__( + name="BasicPlugin", + metadata=PluginMetadata( + version="0.1.0", + description="A basic plugin that adds processing metadata and modifies text after conversion.", + author="Ayoub EL BOUCHTILI", + preprocess={}, + postprocess={} + ) + ) + + def preprocess(self, input_doc: InputDocument) -> InputDocument: + """Add custom metadata during preprocessing.""" + if not hasattr(input_doc, '_plugin_metadata'): + input_doc._plugin_metadata = {} + + self.metadata.preprocess = { + "timestamp": datetime.now().isoformat() + } + return input_doc + + def postprocess(self, result: ConversionResult) 
class TranslationPlugin(DoclingPlugin):
    """Plugin that translates document text to a target language.

    The translation itself is a placeholder: ``translate_text`` returns its
    input unchanged until a real translation backend is plugged in.
    """

    def __init__(self, target_lang: str, source_lang: Optional[str] = None):
        """Initialize the translation plugin.

        Args:
            target_lang: Target language code (e.g. 'fr' for French)
            source_lang: Optional source language code. If not provided,
                will be auto-detected during translation
        """
        super().__init__(
            name="TranslationPlugin",
            metadata=PluginMetadata(
                version="0.1.0",
                description=f"Translates document text to {target_lang}",
                author="Ayoub EL BOUCHTILI",
                preprocess={},
                postprocess={}
            )
        )
        self.target_lang = target_lang
        self.source_lang = source_lang

    def translate_text(self, text: str) -> tuple[str, str]:
        """Translate text to target language.

        Args:
            text: Text to translate

        Returns:
            Tuple of (translated_text, detected_source_lang)
        """
        # IMPLEMENT YOUR TRANSLATION LOGIC HERE
        # FOR EXAMPLE USING GOOGLE TRANSLATE:

        # from googletrans import Translator
        # translator = Translator()
        # if self.source_lang:
        #     result = translator.translate(text, src=self.source_lang, dest=self.target_lang)
        # else:
        #     result = translator.translate(text, dest=self.target_lang)
        # return result.text, result.src

        # END OF PLACEHOLDER IMPLEMENTATION
        return text, self.source_lang or "en"

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        """Translate document text after conversion.

        Text items and table cells are translated in place. The translation
        metadata is stored on the plugin and copied into ``result.plugins``
        under the plugin name, unless an entry with that name already exists.

        Args:
            result: Conversion result whose document is translated.

        Returns:
            The same ``result`` object, modified in place.
        """
        # Bound up front so the metadata below is well defined even when the
        # document contains nothing to translate.
        detected_langs = set()

        # Not gated on ``document.texts``: table cells are translated too,
        # and a document may hold tables without any text item.
        if result.document:
            for element in result.document.iterate_items():
                item = element[0]
                if isinstance(item, TextItem):
                    targets = [item]
                elif isinstance(item, TableItem):
                    targets = item.data.table_cells
                else:
                    continue

                for target in targets:
                    translated, detected = self.translate_text(target.text)
                    target.text = translated
                    detected_langs.add(detected)

        # Add translation metadata; sorted because set order is not stable
        # between runs.
        self.metadata.postprocess = {
            "target_language": self.target_lang,
            "source_languages": sorted(detected_langs),
            "timestamp": datetime.now().isoformat()
        }

        # Add plugin metadata to result
        if self.name not in result.plugins:
            result.plugins[self.name] = self.metadata.model_dump()

        return result


def main():
    """Convert a sample document with the translation plugin enabled."""
    # Example usage
    from docling.document_converter import DocumentConverter

    # Create plugin instance
    translation_plugin = TranslationPlugin(target_lang="fr")

    # Initialize converter with plugin
    converter = DocumentConverter(plugins=[translation_plugin])

    # Convert a document
    result = converter.convert("./tests/data/docx/word_sample.docx")
    print(f"Conversion completed with status: {result.status}")
    print(f"Plugin metadata: {result.plugins}")


if __name__ == "__main__":
    main()
+ +### Creating custom plugins + +Create custom plugins by subclassing `DoclingPlugin` and implementing `preprocess` and/or `postprocess` methods: + +```python +from docling.plugins import DoclingPlugin, PluginMetadata +from docling.datamodel.document import InputDocument, ConversionResult + +class MyCustomPlugin(DoclingPlugin): + def __init__(self): + super().__init__( + name="MyCustomPlugin", # Must start with a letter and contain only letters, numbers, underscores, or hyphens + metadata=PluginMetadata( + version="0.1.0", # Must adhere to semantic versioning + description="A custom plugin example", + author="Your Name", + preprocess={}, + postprocess={} + ) + ) + + def preprocess(self, input_doc: InputDocument) -> InputDocument: + # Modify input document before conversion + return input_doc + + def postprocess(self, result: ConversionResult) -> ConversionResult: + # Modify conversion result after conversion + return result +``` + +### Using plugins in Python + +To use plugins with Docling, create a plugin instance and pass it to the DocumentConverter: + +```python +from docling.document_converter import DocumentConverter +from docling.plugins import DoclingPlugin, PluginMetadata + +# Create plugin instance +my_custom_plugin = MyCustomPlugin() + +# Initialize converter with plugins +converter = DocumentConverter(plugins=[my_custom_plugin]) + +# Convert as usual +result = converter.convert("path/to/document.pdf") +``` + +Enriched plugin metadata is accessible through the `plugins` attribute of the conversion result: + +```python +result = converter.convert("path/to/document.pdf") +plugin_metadata = result.plugins["MyCustomPlugin"] +``` + +Since plugins transform the document and conversion result, you can access the modified document and results through the `result` object just like you would without plugins. For example: + +```python +print(result.document.texts[0].text) +``` + +For a complete example of plugin implementation, see [plugin_basic.py](./examples/plugin_basic.py).
def _test_metadata(description: str) -> PluginMetadata:
    """Build a fresh metadata object for a test plugin.

    All test plugins share the version and author; only the description varies.
    """
    return PluginMetadata(
        version="0.1.0",
        description=description,
        author="Test Author"
    )


class BasicTestPlugin(DoclingPlugin):
    """Plugin with a configurable name and a pass-through preprocess step."""

    def __init__(self, name="TestPlugin"):
        super().__init__(name=name, metadata=_test_metadata("Test plugin"))

    def preprocess(self, input_doc: InputDocument) -> InputDocument:
        return input_doc


class PreprocessOnlyPlugin(DoclingPlugin):
    """Plugin that flags the input document during preprocessing."""

    def __init__(self):
        super().__init__(
            name="PreprocessPlugin",
            metadata=_test_metadata("Preprocess only plugin")
        )

    def preprocess(self, input_doc: InputDocument) -> InputDocument:
        input_doc._test_flag = True
        return input_doc


class PostprocessOnlyPlugin(DoclingPlugin):
    """Plugin that flags the conversion result during postprocessing."""

    def __init__(self):
        super().__init__(
            name="PostprocessPlugin",
            metadata=_test_metadata("Postprocess only plugin")
        )

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        result._test_flag = True
        return result


class ErrorPlugin(DoclingPlugin):
    """Plugin whose preprocess step always raises."""

    def __init__(self):
        super().__init__(name="ErrorPlugin", metadata=_test_metadata("Error plugin"))

    def preprocess(self, input_doc: InputDocument) -> InputDocument:
        raise ValueError("Test error")


@pytest.fixture
def plugin_manager():
    """Provide an empty plugin manager for each test."""
    return PluginManager()


@pytest.fixture
def input_document():
    """Provide an input document description for a PDF path."""
    return InputDocument(
        path_or_stream="test.pdf",
        format=InputFormat.PDF,
        backend=DoclingParseDocumentBackend
    )


@pytest.fixture
def conversion_result(input_document):
    """Provide a conversion result wrapping the input document fixture."""
    return ConversionResult(input=input_document)


def test_plugin_name_validation(plugin_manager):
    """Empty, blank and malformed names are rejected; valid ones pass."""
    rejected_names = (
        ("", "Plugin name cannot be empty or whitespace"),
        (" ", "Plugin name cannot be empty or whitespace"),
        ("123plugin", "Plugin name must start with a letter"),
    )
    for bad_name, expected_message in rejected_names:
        with pytest.raises(ValueError, match=expected_message):
            plugin_manager._validate_plugin_name(bad_name)

    for good_name in ("validPlugin123", "valid_plugin", "valid-plugin"):
        plugin_manager._validate_plugin_name(good_name)


def test_plugin_validation(plugin_manager):
    """Registration rejects None, wrong types, duplicates and bad metadata."""
    with pytest.raises(ValueError, match="Plugin cannot be None"):
        plugin_manager.register_plugin(None)

    with pytest.raises(ValueError, match="Plugin must be an instance of DoclingPlugin"):
        plugin_manager.register_plugin("not a plugin")

    # A second plugin carrying the same default name must be refused.
    plugin_manager.register_plugin(BasicTestPlugin())
    with pytest.raises(ValueError, match="already registered"):
        plugin_manager.register_plugin(BasicTestPlugin())

    # Break the version after construction so registration has to catch it.
    broken = BasicTestPlugin(name="InvalidMetadataPlugin")
    broken.metadata.version = "invalid"
    with pytest.raises(ValueError, match="Invalid metadata"):
        plugin_manager.register_plugin(broken)


def test_preprocess_execution(plugin_manager, input_document):
    """A registered preprocessor is applied to the input document."""
    plugin_manager.register_plugin(PreprocessOnlyPlugin())

    outcome = plugin_manager.execute_preprocessors(input_document)

    assert hasattr(outcome, '_test_flag')
    assert outcome._test_flag is True


def test_postprocess_execution(plugin_manager, conversion_result):
    """A registered postprocessor is applied to the conversion result."""
    plugin_manager.register_plugin(PostprocessOnlyPlugin())

    outcome = plugin_manager.execute_postprocessors(conversion_result)

    assert hasattr(outcome, '_test_flag')
    assert outcome._test_flag is True


def test_plugin_execution_error_handling(plugin_manager, input_document):
    """A failing preprocessor surfaces as a RuntimeError naming the plugin."""
    plugin_manager.register_plugin(ErrorPlugin())

    with pytest.raises(RuntimeError, match="Error in preprocessor ErrorPlugin"):
        plugin_manager.execute_preprocessors(input_document)


def test_none_input_validation(plugin_manager):
    """Both execution entry points refuse None."""
    with pytest.raises(ValueError, match="Input document cannot be None"):
        plugin_manager.execute_preprocessors(None)

    with pytest.raises(ValueError, match="Conversion result cannot be None"):
        plugin_manager.execute_postprocessors(None)


def test_multiple_plugins_execution_order(plugin_manager, input_document):
    """Preprocessors run in the order in which they were registered."""

    class OrderTestPlugin(DoclingPlugin):
        """Plugin that appends its own name to a shared list when it runs."""

        def __init__(self, name, order_list):
            super().__init__(name=name, metadata=_test_metadata("Order test plugin"))
            self.order_list = order_list

        def preprocess(self, input_doc: InputDocument) -> InputDocument:
            self.order_list.append(self.name)
            return input_doc

    execution_order = []
    for plugin_name in ("Plugin1", "Plugin2"):
        plugin_manager.register_plugin(OrderTestPlugin(plugin_name, execution_order))

    plugin_manager.execute_preprocessors(input_document)

    assert execution_order == ["Plugin1", "Plugin2"]