Complete observability for AI applications with intelligent monitoring, predictive alerting, and performance insights powered by machine learning.
Key capabilities: AI-powered insights, predictive notifications, request flow analysis, and intelligent log processing.
# AI-Enhanced Prometheus Configuration
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'mcpcodex-production'
    environment: 'prod'

# AI: Dynamic service discovery
scrape_configs:
  # MCPCodex AI Services
  - job_name: 'mcpcodex-ai-services'
    kubernetes_sd_configs:
      - role: endpoints
        namespaces:
          names:
            - production
            - staging
    relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_mcpcodex_ai_monitor]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_endpoint_port_name]
        action: keep
        regex: (metrics|http-metrics)
    metrics_path: /metrics
    scrape_interval: 10s

  # AI Model Performance
  - job_name: 'ai-model-metrics'
    static_configs:
      - targets: ['ai-model-exporter:9090']
    metrics_path: /ai-metrics
    scrape_interval: 30s
    params:
      format: ['prometheus']
      model: ['claude-3-opus', 'gpt-4']

  # Infrastructure Monitoring
  - job_name: 'node-exporter'
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__

  # Application Performance
  - job_name: 'mcpcodex-app'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names: [production]
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)

# AI-Powered Alerting Rules
rule_files:
  - "ai_alerts.yml"
  - "performance_rules.yml"
  - "security_alerts.yml"

# Alert Manager Configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
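The mcpcodex-ai-services job only keeps endpoints whose backing Service carries the monitoring annotation and exposes a port named metrics or http-metrics. Below is a minimal sketch of such a Service, assuming the annotation key is mcpcodex.ai/monitor (which Prometheus rewrites to __meta_kubernetes_service_annotation_mcpcodex_ai_monitor) and using a hypothetical context-manager service as the example.

# example-service.yaml (illustrative)
apiVersion: v1
kind: Service
metadata:
  name: context-manager            # hypothetical service name
  namespace: production
  annotations:
    mcpcodex.ai/monitor: "true"    # assumed annotation key; matched by the keep rule above
spec:
  selector:
    app: context-manager
  ports:
    - name: http-metrics           # matches the (metrics|http-metrics) port-name regex
      port: 9090
      targetPort: 9090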
{
  "dashboard": {
    "id": null,
    "title": "MCPCodex AI Monitoring Dashboard",
    "tags": ["mcpcodex", "ai", "monitoring"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "AI Model Response Times",
        "type": "timeseries",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum(rate(mcpcodex_ai_request_duration_seconds_bucket[5m])) by (le, model))",
            "legendFormat": "95th percentile - {{model}}",
            "refId": "A"
          },
          {
            "expr": "histogram_quantile(0.50, sum(rate(mcpcodex_ai_request_duration_seconds_bucket[5m])) by (le, model))",
            "legendFormat": "50th percentile - {{model}}",
            "refId": "B"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "color": {
              "mode": "palette-classic"
            },
            "unit": "s"
          }
        }
      },
      {
        "id": 2,
        "title": "Context Processing Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(mcpcodex_contexts_processed_total[5m]))",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "color": {
              "mode": "thresholds"
            },
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "yellow", "value": 100},
                {"color": "green", "value": 500}
              ]
            },
            "unit": "reqps"
          }
        }
      },
      {
        "id": 3,
        "title": "AI Agent Health",
        "type": "heatmap",
        "targets": [
          {
            "expr": "mcpcodex_ai_agent_health_score",
            "legendFormat": "Agent {{agent_id}}",
            "refId": "A"
          }
        ]
      },
      {
        "id": 4,
        "title": "Token Usage by Model",
        "type": "piechart",
        "targets": [
          {
            "expr": "sum by (model) (mcpcodex_tokens_consumed_total)",
            "legendFormat": "{{model}}",
            "refId": "A"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "5s"
  }
}
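To load this dashboard automatically rather than importing it by hand, Grafana's file-based provisioning can watch a directory of dashboard JSON files. A minimal sketch follows, assuming the JSON above is saved under /var/lib/grafana/dashboards; the provider name, folder, and path are placeholders.

# /etc/grafana/provisioning/dashboards/mcpcodex.yaml (illustrative)
apiVersion: 1
providers:
  - name: mcpcodex-ai              # arbitrary provider name
    folder: MCPCodex               # Grafana folder to place dashboards in
    type: file
    disableDeletion: false
    updateIntervalSeconds: 30      # how often Grafana rescans the directory
    options:
      path: /var/lib/grafana/dashboards   # assumed mount path for the JSON files

Note that file-provisioned dashboards expect the inner dashboard object; the outer "dashboard" wrapper shown above matches the HTTP import API payload rather than the on-disk format.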
# AI-Powered Alert Rules
# ai_alerts.yml
groups:
  - name: mcpcodex.ai.alerts
    rules:
      # AI Model Performance
      - alert: HighAIResponseTime
        expr: histogram_quantile(0.95, sum(rate(mcpcodex_ai_request_duration_seconds_bucket[5m])) by (le)) > 10
        for: 2m
        labels:
          severity: warning
          service: mcpcodex-ai
        annotations:
          summary: "AI model response time is high"
          description: "95th percentile response time is {{ $value }}s for the last 5 minutes"
          runbook_url: "https://docs.mcpcodex.com/runbooks/ai-performance"

      - alert: AIModelDown
        expr: up{job="mcpcodex-ai-services"} == 0
        for: 1m
        labels:
          severity: critical
          service: mcpcodex-ai
        annotations:
          summary: "AI model service is down"
          description: "AI service {{ $labels.instance }} has been down for more than 1 minute"

      # Context Management
      - alert: ContextMemoryHigh
        expr: (mcpcodex_context_memory_usage_bytes / mcpcodex_context_memory_limit_bytes) > 0.8
        for: 5m
        labels:
          severity: warning
          service: context-manager
        annotations:
          summary: "Context memory usage is high"
          description: "Memory usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"

      # Token Usage
      - alert: TokenLimitApproaching
        expr: (mcpcodex_tokens_used / mcpcodex_tokens_limit) > 0.8
        for: 2m
        labels:
          severity: warning
          service: token-manager
        annotations:
          summary: "Token limit approaching"
          description: "Token usage is at {{ $value | humanizePercentage }} of limit"

      # Security Alerts
      - alert: UnauthorizedAIAccess
        expr: increase(mcpcodex_unauthorized_requests_total[5m]) > 10
        for: 1m
        labels:
          severity: critical
          service: security
        annotations:
          summary: "Multiple unauthorized AI access attempts"
          description: "{{ $value }} unauthorized requests in the last 5 minutes"

      # Performance Degradation
      - alert: AIThroughputDrop
        expr: (rate(mcpcodex_requests_total[5m]) < rate(mcpcodex_requests_total[15m]) * 0.5)
        for: 3m
        labels:
          severity: warning
          service: performance
        annotations:
          summary: "AI throughput has dropped significantly"
          description: "Current throughput is less than 50% of the 15-minute average"
#!/bin/bash
# AI-Powered Observability Setup Script
set -e

NAMESPACE="monitoring"
# NOTE: helm --version expects the *chart* version; pin these to the chart
# releases that ship the Grafana/Prometheus application versions you want.
GRAFANA_VERSION="9.5.2"
PROMETHEUS_VERSION="2.45.0"

echo "🤖 Setting up AI-powered observability stack..."

# Create monitoring namespace
kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

# Deploy Prometheus with AI enhancements
echo "📊 Deploying Prometheus..."
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \
  --namespace "$NAMESPACE" \
  --version "$PROMETHEUS_VERSION" \
  --values prometheus-values.yaml \
  --set prometheus.prometheusSpec.retention=30d \
  --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi

# Deploy Grafana with AI dashboards
echo "📈 Deploying Grafana..."
helm upgrade --install grafana grafana/grafana \
  --namespace "$NAMESPACE" \
  --version "$GRAFANA_VERSION" \
  --set persistence.enabled=true \
  --set persistence.size=10Gi \
  --set adminPassword="$GRAFANA_ADMIN_PASSWORD"

# Install MCPCodex AI monitoring components
echo "🔍 Installing AI monitoring components..."
mcpcodex monitoring install \
  --prometheus-endpoint http://prometheus:9090 \
  --grafana-endpoint http://grafana:3000 \
  --enable-ai-insights \
  --auto-dashboard-generation

# Set up AI-powered alerting
echo "🚨 Configuring AI alerts..."
mcpcodex alerts configure \
  --smart-thresholds \
  --anomaly-detection \
  --predictive-alerts \
  --slack-webhook "$SLACK_WEBHOOK"

# Deploy custom AI metrics exporters
echo "📡 Deploying AI metrics exporters..."
kubectl apply -f ai-model-exporter.yaml
kubectl apply -f context-metrics-exporter.yaml
kubectl apply -f token-usage-exporter.yaml

# Install Jaeger for distributed tracing
# NOTE: the Jaeger operator expects cert-manager to be installed beforehand.
echo "🔍 Setting up distributed tracing..."
kubectl apply -f https://github.com/jaegertracing/jaeger-operator/releases/download/v1.47.0/jaeger-operator.yaml
kubectl apply -f jaeger-instance.yaml

# Configure log aggregation with AI insights
echo "📝 Setting up log aggregation..."
mcpcodex logs setup \
  --provider elastic \
  --ai-log-analysis \
  --error-prediction \
  --performance-insights

echo "✅ Observability stack deployed successfully!"
echo "🎯 Access Grafana: kubectl port-forward svc/grafana 3000:80 -n monitoring"
echo "📊 Access Prometheus: kubectl port-forward svc/prometheus 9090:9090 -n monitoring"
Deploy the complete observability stack:
mcpcodex monitoring setup --ai
Set up intelligent alerting rules:
mcpcodex alerts configure --smart
Open the real-time AI metrics dashboard:
mcpcodex metrics dashboard
Run a comprehensive system health analysis:
mcpcodex health check --full
Get complete visibility into your AI applications with intelligent monitoring and alerting.