- Added comprehensive N8N development tools collection - Added Docker-containerized mock API server for testing - Added complete documentation and setup guides - Added mock API server with health checks and data endpoints - Tools include workflow analyzers, debuggers, and controllers 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
405 lines
16 KiB
Python
405 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Execution Monitor - Tools for monitoring N8N workflow executions
|
|
Provides real-time monitoring, alerting, and execution management
|
|
"""
|
|
|
|
import json
import logging
import time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from enum import Enum
from threading import Thread, Event
from typing import Dict, List, Optional, Callable, Any
|
|
|
|
|
class ExecutionStatus(Enum):
    """Execution status enumeration.

    Member values mirror the raw status strings returned by the N8N
    executions API, so ``ExecutionStatus(raw_status)`` maps an API payload
    straight to a member (see ``_create_execution_event``).
    """
    RUNNING = "running"      # execution currently in progress
    SUCCESS = "success"      # finished without errors
    ERROR = "error"          # finished with an error
    CANCELLED = "cancelled"  # stopped before completion
    WAITING = "waiting"      # paused/waiting (exact semantics defined by N8N)
|
@dataclass
class ExecutionEvent:
    """Represents an execution event.

    A snapshot of a single execution's state at the moment the monitor
    observed it; instances are passed to registered callbacks.
    """
    execution_id: str  # N8N execution id
    workflow_id: str  # id of the workflow that ran
    status: ExecutionStatus  # status at observation time
    timestamp: datetime  # when the monitor created this event (local time)
    duration: Optional[float] = None  # seconds; only set when start AND finish known
    error_message: Optional[str] = None  # populated for ERROR status when available
    node_data: Optional[Dict] = None  # raw 'data' payload from the API, if any
|
class ExecutionMonitor:
    """Monitors N8N workflow executions and provides real-time insights"""

    def __init__(self, n8n_client, poll_interval: int = 5):
        """Initialize execution monitor.

        Args:
            n8n_client: client exposing ``get_executions()``,
                ``get_execution()`` and ``execute_workflow()`` (interface
                inferred from usage below — confirm against the client class).
            poll_interval: seconds between polls of the executions API.
        """
        self.client = n8n_client
        self.poll_interval = poll_interval
        # True while the background monitor loop should keep running.
        self.monitoring = False
        # Signalled to ask the monitor thread to stop.
        self.stop_event = Event()
        # Event-type name -> list of callables taking an ExecutionEvent.
        self.callbacks = {
            'on_success': [],
            'on_error': [],
            'on_start': [],
            'on_complete': []
        }
        # execution_id -> {'status', 'last_updated'}; used to detect new or
        # changed executions between polls. Note: grows without bound.
        self.tracked_executions = {}
        self.logger = self._setup_logger()
|
def _setup_logger(self) -> logging.Logger:
|
|
"""Setup logging for the monitor"""
|
|
logger = logging.getLogger('N8NExecutionMonitor')
|
|
logger.setLevel(logging.INFO)
|
|
|
|
if not logger.handlers:
|
|
handler = logging.StreamHandler()
|
|
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
handler.setFormatter(formatter)
|
|
logger.addHandler(handler)
|
|
|
|
return logger
|
|
|
|
def add_callback(self, event_type: str, callback: Callable[[ExecutionEvent], None]):
|
|
"""Add callback for execution events"""
|
|
if event_type in self.callbacks:
|
|
self.callbacks[event_type].append(callback)
|
|
else:
|
|
raise ValueError(f"Invalid event type: {event_type}")
|
|
|
|
def start_monitoring(self, workflow_ids: Optional[List[str]] = None):
|
|
"""Start monitoring workflow executions"""
|
|
if self.monitoring:
|
|
self.logger.warning("Monitoring already running")
|
|
return
|
|
|
|
self.monitoring = True
|
|
self.stop_event.clear()
|
|
|
|
monitor_thread = Thread(target=self._monitor_loop, args=(workflow_ids,))
|
|
monitor_thread.daemon = True
|
|
monitor_thread.start()
|
|
|
|
self.logger.info("Execution monitoring started")
|
|
|
|
def stop_monitoring(self):
|
|
"""Stop monitoring workflow executions"""
|
|
if not self.monitoring:
|
|
return
|
|
|
|
self.monitoring = False
|
|
self.stop_event.set()
|
|
self.logger.info("Execution monitoring stopped")
|
|
|
|
def _monitor_loop(self, workflow_ids: Optional[List[str]]):
|
|
"""Main monitoring loop"""
|
|
while self.monitoring and not self.stop_event.is_set():
|
|
try:
|
|
self._check_executions(workflow_ids)
|
|
time.sleep(self.poll_interval)
|
|
except Exception as e:
|
|
self.logger.error(f"Error in monitoring loop: {e}")
|
|
time.sleep(self.poll_interval)
|
|
|
|
def _check_executions(self, workflow_ids: Optional[List[str]]):
|
|
"""Check for new or updated executions"""
|
|
try:
|
|
# Get recent executions
|
|
executions = self.client.get_executions(limit=50)
|
|
|
|
for execution in executions:
|
|
execution_id = execution.get('id')
|
|
workflow_id = execution.get('workflowId')
|
|
|
|
# Filter by workflow IDs if specified
|
|
if workflow_ids and workflow_id not in workflow_ids:
|
|
continue
|
|
|
|
# Check if this is a new or updated execution
|
|
if self._is_execution_updated(execution):
|
|
event = self._create_execution_event(execution)
|
|
self._handle_execution_event(event)
|
|
|
|
# Update tracked executions
|
|
self.tracked_executions[execution_id] = {
|
|
'status': execution.get('status'),
|
|
'last_updated': datetime.now()
|
|
}
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Error checking executions: {e}")
|
|
|
|
def _is_execution_updated(self, execution: Dict) -> bool:
|
|
"""Check if execution is new or has been updated"""
|
|
execution_id = execution.get('id')
|
|
current_status = execution.get('status')
|
|
|
|
if execution_id not in self.tracked_executions:
|
|
return True
|
|
|
|
tracked_status = self.tracked_executions[execution_id]['status']
|
|
return current_status != tracked_status
|
|
|
|
def _create_execution_event(self, execution: Dict) -> ExecutionEvent:
|
|
"""Create ExecutionEvent from execution data"""
|
|
execution_id = execution.get('id')
|
|
workflow_id = execution.get('workflowId')
|
|
status = ExecutionStatus(execution.get('status'))
|
|
|
|
# Calculate duration if available
|
|
duration = None
|
|
start_time = execution.get('startedAt')
|
|
finish_time = execution.get('finishedAt')
|
|
|
|
if start_time and finish_time:
|
|
start_dt = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
|
|
finish_dt = datetime.fromisoformat(finish_time.replace('Z', '+00:00'))
|
|
duration = (finish_dt - start_dt).total_seconds()
|
|
|
|
# Extract error message if available
|
|
error_message = None
|
|
if status == ExecutionStatus.ERROR:
|
|
data = execution.get('data', {})
|
|
if 'resultData' in data and 'error' in data['resultData']:
|
|
error_message = data['resultData']['error'].get('message')
|
|
|
|
return ExecutionEvent(
|
|
execution_id=execution_id,
|
|
workflow_id=workflow_id,
|
|
status=status,
|
|
timestamp=datetime.now(),
|
|
duration=duration,
|
|
error_message=error_message,
|
|
node_data=execution.get('data')
|
|
)
|
|
|
|
def _handle_execution_event(self, event: ExecutionEvent):
|
|
"""Handle execution event and trigger callbacks"""
|
|
self.logger.info(f"Execution {event.execution_id} status: {event.status.value}")
|
|
|
|
# Trigger appropriate callbacks
|
|
if event.status == ExecutionStatus.SUCCESS:
|
|
for callback in self.callbacks['on_success']:
|
|
try:
|
|
callback(event)
|
|
except Exception as e:
|
|
self.logger.error(f"Error in success callback: {e}")
|
|
|
|
elif event.status == ExecutionStatus.ERROR:
|
|
for callback in self.callbacks['on_error']:
|
|
try:
|
|
callback(event)
|
|
except Exception as e:
|
|
self.logger.error(f"Error in error callback: {e}")
|
|
|
|
elif event.status == ExecutionStatus.RUNNING:
|
|
for callback in self.callbacks['on_start']:
|
|
try:
|
|
callback(event)
|
|
except Exception as e:
|
|
self.logger.error(f"Error in start callback: {e}")
|
|
|
|
# Always trigger complete callbacks for finished executions
|
|
if event.status in [ExecutionStatus.SUCCESS, ExecutionStatus.ERROR, ExecutionStatus.CANCELLED]:
|
|
for callback in self.callbacks['on_complete']:
|
|
try:
|
|
callback(event)
|
|
except Exception as e:
|
|
self.logger.error(f"Error in complete callback: {e}")
|
|
|
|
def execute_and_monitor(self, workflow_id: str, test_data: Optional[Dict] = None, timeout: int = 300) -> ExecutionEvent:
|
|
"""Execute workflow and monitor until completion"""
|
|
try:
|
|
# Start execution
|
|
result = self.client.execute_workflow(workflow_id, test_data)
|
|
execution_id = result.get('id')
|
|
|
|
if not execution_id:
|
|
raise Exception("Failed to get execution ID from workflow execution")
|
|
|
|
self.logger.info(f"Started execution {execution_id} for workflow {workflow_id}")
|
|
|
|
# Monitor execution until completion
|
|
start_time = time.time()
|
|
while time.time() - start_time < timeout:
|
|
execution = self.client.get_execution(execution_id)
|
|
status = execution.get('status')
|
|
|
|
if status in ['success', 'error', 'cancelled']:
|
|
event = self._create_execution_event(execution)
|
|
self.logger.info(f"Execution {execution_id} completed with status: {status}")
|
|
return event
|
|
|
|
time.sleep(2) # Check every 2 seconds
|
|
|
|
# Timeout reached
|
|
raise TimeoutError(f"Execution {execution_id} did not complete within {timeout} seconds")
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Error executing and monitoring workflow: {e}")
|
|
raise
|
|
|
|
def get_execution_summary(self, hours: int = 24) -> Dict:
|
|
"""Get execution summary for the specified time period"""
|
|
try:
|
|
# Get executions from the specified time period
|
|
executions = self.client.get_executions(limit=200)
|
|
|
|
# Filter by time period
|
|
cutoff_time = datetime.now() - timedelta(hours=hours)
|
|
recent_executions = []
|
|
|
|
for execution in executions:
|
|
started_at = execution.get('startedAt')
|
|
if started_at:
|
|
exec_time = datetime.fromisoformat(started_at.replace('Z', '+00:00'))
|
|
if exec_time > cutoff_time:
|
|
recent_executions.append(execution)
|
|
|
|
# Calculate summary statistics
|
|
total = len(recent_executions)
|
|
successful = len([e for e in recent_executions if e.get('status') == 'success'])
|
|
failed = len([e for e in recent_executions if e.get('status') == 'error'])
|
|
running = len([e for e in recent_executions if e.get('status') == 'running'])
|
|
|
|
# Calculate average duration
|
|
durations = []
|
|
for execution in recent_executions:
|
|
start_time = execution.get('startedAt')
|
|
finish_time = execution.get('finishedAt')
|
|
if start_time and finish_time:
|
|
start_dt = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
|
|
finish_dt = datetime.fromisoformat(finish_time.replace('Z', '+00:00'))
|
|
durations.append((finish_dt - start_dt).total_seconds())
|
|
|
|
avg_duration = sum(durations) / len(durations) if durations else 0
|
|
|
|
# Group by workflow
|
|
workflow_stats = {}
|
|
for execution in recent_executions:
|
|
workflow_id = execution.get('workflowId')
|
|
if workflow_id not in workflow_stats:
|
|
workflow_stats[workflow_id] = {'total': 0, 'success': 0, 'error': 0}
|
|
|
|
workflow_stats[workflow_id]['total'] += 1
|
|
status = execution.get('status')
|
|
if status == 'success':
|
|
workflow_stats[workflow_id]['success'] += 1
|
|
elif status == 'error':
|
|
workflow_stats[workflow_id]['error'] += 1
|
|
|
|
return {
|
|
'time_period_hours': hours,
|
|
'total_executions': total,
|
|
'successful_executions': successful,
|
|
'failed_executions': failed,
|
|
'running_executions': running,
|
|
'success_rate': (successful / total * 100) if total > 0 else 0,
|
|
'average_duration_seconds': avg_duration,
|
|
'workflow_statistics': workflow_stats,
|
|
'executions': recent_executions
|
|
}
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Error generating execution summary: {e}")
|
|
raise
|
|
|
|
def wait_for_execution_completion(self, execution_id: str, timeout: int = 300) -> ExecutionEvent:
|
|
"""Wait for a specific execution to complete"""
|
|
start_time = time.time()
|
|
|
|
while time.time() - start_time < timeout:
|
|
execution = self.client.get_execution(execution_id)
|
|
status = execution.get('status')
|
|
|
|
if status in ['success', 'error', 'cancelled']:
|
|
return self._create_execution_event(execution)
|
|
|
|
time.sleep(2)
|
|
|
|
raise TimeoutError(f"Execution {execution_id} did not complete within {timeout} seconds")
|
|
|
|
    def cancel_execution(self, execution_id: str) -> bool:
        """Cancel a running execution.

        NOTE(review): this is a placeholder — no API call is made; it
        only logs a warning and optimistically returns True. Returns
        False only if that logging itself raises.
        """
        try:
            # N8N API might have a cancel endpoint - this would need to be implemented
            # based on the actual API capabilities
            self.logger.warning(f"Cancel execution {execution_id} - implement based on N8N API")
            return True
        except Exception as e:
            self.logger.error(f"Error cancelling execution {execution_id}: {e}")
            return False
|
def get_execution_logs(self, execution_id: str) -> Dict:
|
|
"""Get detailed logs for an execution"""
|
|
try:
|
|
execution = self.client.get_execution(execution_id)
|
|
|
|
logs = {
|
|
'execution_id': execution_id,
|
|
'status': execution.get('status'),
|
|
'workflow_id': execution.get('workflowId'),
|
|
'started_at': execution.get('startedAt'),
|
|
'finished_at': execution.get('finishedAt'),
|
|
'node_logs': [],
|
|
'errors': []
|
|
}
|
|
|
|
# Extract node-level logs if available
|
|
data = execution.get('data', {})
|
|
if 'resultData' in data:
|
|
result_data = data['resultData']
|
|
|
|
# Extract errors
|
|
if 'error' in result_data:
|
|
logs['errors'].append(result_data['error'])
|
|
|
|
# Extract node execution data
|
|
if 'runData' in result_data:
|
|
for node_name, node_runs in result_data['runData'].items():
|
|
for run in node_runs:
|
|
logs['node_logs'].append({
|
|
'node': node_name,
|
|
'start_time': run.get('startTime'),
|
|
'execution_time': run.get('executionTime'),
|
|
'data': run.get('data', {}),
|
|
'error': run.get('error')
|
|
})
|
|
|
|
return logs
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Error getting execution logs: {e}")
|
|
raise
|
|
|
|
|
|
# Utility functions for common monitoring scenarios
|
|
|
|
def create_simple_monitor(n8n_client) -> ExecutionMonitor:
    """Create a simple execution monitor with basic console logging.

    Attaches print-based callbacks for start, success and error events.

    Args:
        n8n_client: client passed through to ExecutionMonitor.

    Returns:
        A configured (but not yet started) ExecutionMonitor.
    """
    monitor = ExecutionMonitor(n8n_client)

    def log_success(event: ExecutionEvent):
        # BUG FIX: duration is None when the API omits startedAt/finishedAt
        # (the dataclass default); the original f-string
        # {event.duration:.2f} raised TypeError in that case.
        if event.duration is not None:
            elapsed = f"{event.duration:.2f}s"
        else:
            elapsed = "unknown time"
        print(f"✅ Execution {event.execution_id} completed successfully in {elapsed}")

    def log_error(event: ExecutionEvent):
        print(f"❌ Execution {event.execution_id} failed: {event.error_message}")

    def log_start(event: ExecutionEvent):
        print(f"🚀 Execution {event.execution_id} started")

    monitor.add_callback('on_success', log_success)
    monitor.add_callback('on_error', log_error)
    monitor.add_callback('on_start', log_start)

    return monitor
|
# Smoke-test entry point: confirms the module imports cleanly; real use
# requires constructing ExecutionMonitor with an N8N client.
if __name__ == "__main__":
    print("Execution Monitor initialized successfully.")