# Monitoring Runbook

**Last Updated**: 2025-11-27
**Purpose**: Comprehensive guide for monitoring and observability in VoiceAssist V2

---

## Monitoring Architecture

```
Application Metrics
        ↓
Prometheus (Metrics Collection)
        ↓
Grafana (Visualization)
        ↓
AlertManager (Alerting)
        ↓
PagerDuty/Slack/Email
```

### Key Monitoring Components

| Component               | Purpose                      | Port         | Dashboard                     |
| ----------------------- | ---------------------------- | ------------ | ----------------------------- |
| **Prometheus**          | Metrics collection & storage | 9090         | http://localhost:9090         |
| **Grafana**             | Metrics visualization        | 3000         | http://localhost:3000         |
| **AlertManager**        | Alert routing & management   | 9093         | http://localhost:9093         |
| **Application Metrics** | Custom app metrics           | 8000/metrics | http://localhost:8000/metrics |

---

## Setup Monitoring Stack

### Docker Compose Configuration

```yaml
# Add to docker-compose.yml
services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./monitoring/alerts.yml:/etc/prometheus/alerts.yml
      - prometheus_data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=30d"
      - "--web.console.libraries=/etc/prometheus/console_libraries"
      - "--web.console.templates=/etc/prometheus/consoles"

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
    depends_on:
      - prometheus

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager_data:/alertmanager
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"

  node-exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    command:
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro

  postgres-exporter:
    image: prometheuscommunity/postgres-exporter:latest
    ports:
      - "9187:9187"
    environment:
      DATA_SOURCE_NAME: "postgresql://voiceassist:${POSTGRES_PASSWORD}@postgres:5432/voiceassist?sslmode=disable"
    depends_on:
      - postgres

  redis-exporter:
    image: oliver006/redis_exporter:latest
    ports:
      - "9121:9121"
    environment:
      REDIS_ADDR: "redis:6379"
    depends_on:
      - redis

volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:
```
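Before wiring up dashboards, it helps to confirm that every exporter actually serves metrics. The sketch below polls each scrape endpoint from the host; it assumes the host port mappings above and uses only the Python standard library (the script itself is illustrative, not part of the repo).

```python
# Quick smoke test for the exporters defined above (hypothetical helper).
# Assumes the stack runs on localhost with the host ports from docker-compose.yml.
from urllib.request import urlopen
from urllib.error import URLError

EXPORTERS = {
    "app": "http://localhost:8000/metrics",
    "node-exporter": "http://localhost:9100/metrics",
    "postgres-exporter": "http://localhost:9187/metrics",
    "redis-exporter": "http://localhost:9121/metrics",
}

def check_exporters(timeout: float = 3.0) -> None:
    for name, url in EXPORTERS.items():
        try:
            with urlopen(url, timeout=timeout) as resp:
                # Every Prometheus exporter serves plain-text samples;
                # a 200 with a non-empty body is enough for a smoke test.
                ok = resp.status == 200 and bool(resp.read(1))
            print(f"{name:20s} {'OK' if ok else 'EMPTY RESPONSE'}")
        except URLError as exc:
            print(f"{name:20s} UNREACHABLE ({exc.reason})")

if __name__ == "__main__":
    check_exporters()
```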
### Prometheus Configuration

```yaml
# Create monitoring/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: "voiceassist-prod"
    environment: "production"

# Load alerting rules
rule_files:
  - "/etc/prometheus/alerts.yml"

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Scrape configurations
scrape_configs:
  # VoiceAssist Application
  - job_name: "voiceassist-app"
    static_configs:
      - targets: ["voiceassist-server:8000"]
    metrics_path: "/metrics"
    scrape_interval: 10s

  # PostgreSQL
  - job_name: "postgresql"
    static_configs:
      - targets: ["postgres-exporter:9187"]

  # Redis
  - job_name: "redis"
    static_configs:
      - targets: ["redis-exporter:9121"]

  # Node metrics
  - job_name: "node"
    static_configs:
      - targets: ["node-exporter:9100"]

  # Prometheus itself
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  # Grafana
  - job_name: "grafana"
    static_configs:
      - targets: ["grafana:3000"]
```

### Alert Rules

```yaml
# Create monitoring/alerts.yml
groups:
  - name: voiceassist_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="voiceassist-app"} == 0
        for: 1m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "VoiceAssist application is down"
          description: "Application {{ $labels.instance }} is not responding"

      # High error rate
      - alert: HighErrorRate
        expr: |
          rate(http_requests_total{status=~"5.."}[5m])
          / rate(http_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} over last 5 minutes"

      # Slow response times
      - alert: SlowResponseTime
        expr: |
          histogram_quantile(0.95,
            rate(http_request_duration_seconds_bucket[5m])
          ) > 2
        for: 5m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "Slow API response times"
          description: "95th percentile response time is {{ $value }}s"

      # High CPU usage
      - alert: HighCPUUsage
        expr: |
          100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High CPU usage"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"

      # High memory usage
      - alert: HighMemoryUsage
        expr: |
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"

      # Database connection pool exhaustion
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          pg_stat_database_numbackends / pg_settings_max_connections > 0.8
        for: 5m
        labels:
          severity: warning
          component: database
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: "Database connections at {{ $value | humanizePercentage }} of maximum"

      # Database down
      - alert: DatabaseDown
        expr: up{job="postgresql"} == 0
        for: 1m
        labels:
          severity: critical
          component: database
        annotations:
          summary: "PostgreSQL database is down"
          description: "Database {{ $labels.instance }} is not responding"

      # Redis down
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
          component: cache
        annotations:
          summary: "Redis is down"
          description: "Redis {{ $labels.instance }} is not responding"

      # High Redis memory usage
      - alert: HighRedisMemory
        expr: |
          redis_memory_used_bytes / redis_memory_max_bytes > 0.9
        for: 5m
        labels:
          severity: warning
          component: cache
        annotations:
          summary: "Redis memory usage high"
          description: "Redis memory usage at {{ $value | humanizePercentage }}"

      # Disk space low
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 20
        for: 10m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "Low disk space"
          description: "Only {{ $value }}% disk space remaining on {{ $labels.instance }}"

      # Certificate expiration
      - alert: SSLCertificateExpiring
        expr: |
          (ssl_certificate_expiry_seconds - time()) / 86400 < 30
        for: 1h
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "SSL certificate expiring soon"
          description: "SSL certificate expires in {{ $value }} days"
```
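Rule expressions are easiest to debug before they ship. A small helper like the following (a sketch; assumes Prometheus is reachable on localhost:9090, standard library only) runs an expression as an instant query so you can preview what an alert would evaluate to right now.

```python
# Sketch: spot-check a rule expression against the live Prometheus API
# before committing it to alerts.yml.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

PROM = "http://localhost:9090"

def instant_query(expr: str) -> list:
    """Run an instant query and return the result vector."""
    url = f"{PROM}/api/v1/query?" + urlencode({"query": expr})
    with urlopen(url, timeout=5) as resp:
        payload = json.load(resp)
    if payload["status"] != "success":
        raise RuntimeError(f"query failed: {payload}")
    return payload["data"]["result"]

# Example: preview what HighErrorRate would evaluate to right now.
expr = (
    'rate(http_requests_total{status=~"5.."}[5m]) '
    "/ rate(http_requests_total[5m])"
)
for sample in instant_query(expr):
    print(sample["metric"], "=>", sample["value"][1])
```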
### AlertManager Configuration

```yaml
# Create monitoring/alertmanager.yml
global:
  resolve_timeout: 5m
  slack_api_url: "${SLACK_WEBHOOK_URL}"

# Default route
route:
  receiver: "default"
  group_by: ["alertname", "cluster", "service"]
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h

  routes:
    # Critical alerts -> PagerDuty + Slack
    - match:
        severity: critical
      receiver: "pagerduty-critical"
      continue: true
    - match:
        severity: critical
      receiver: "slack-critical"

    # Warning alerts -> Slack only
    - match:
        severity: warning
      receiver: "slack-warnings"

# Receivers
receivers:
  - name: "default"
    slack_configs:
      - channel: "#voiceassist-alerts"
        title: "VoiceAssist Alert"
        text: '{{ range .Alerts }}{{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}'

  - name: "pagerduty-critical"
    pagerduty_configs:
      - service_key: "${PAGERDUTY_SERVICE_KEY}"
        description: "{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}"

  - name: "slack-critical"
    slack_configs:
      - channel: "#voiceassist-critical"
        username: "AlertManager"
        color: "danger"
        title: "🔴 CRITICAL: {{ .GroupLabels.alertname }}"
        text: |
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Severity:* {{ .GroupLabels.severity }}
          *Component:* {{ .GroupLabels.component }}

  - name: "slack-warnings"
    slack_configs:
      - channel: "#voiceassist-alerts"
        username: "AlertManager"
        color: "warning"
        title: "⚠️ WARNING: {{ .GroupLabels.alertname }}"
        text: |
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Severity:* {{ .GroupLabels.severity }}
          *Component:* {{ .GroupLabels.component }}

  - name: "email-ops"
    email_configs:
      - to: "ops-team@voiceassist.local"
        from: "alertmanager@voiceassist.local"
        smarthost: "smtp.gmail.com:587"
        auth_username: "${SMTP_USERNAME}"
        auth_password: "${SMTP_PASSWORD}"
        headers:
          Subject: "[VoiceAssist] {{ .GroupLabels.alertname }}"
```

### Deploy Monitoring Stack

```bash
# Create monitoring directory
mkdir -p /Users/mohammednazmy/VoiceAssist/monitoring/grafana/{provisioning,dashboards}

# Start monitoring stack
docker compose up -d prometheus grafana alertmanager node-exporter postgres-exporter redis-exporter

# Verify services
docker compose ps | grep -E "(prometheus|grafana|alertmanager)"

# Check Prometheus targets
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'

# Access Grafana
echo "Grafana: http://localhost:3000 (admin/admin)"
echo "Prometheus: http://localhost:9090"
echo "AlertManager: http://localhost:9093"
```
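Once the stack is up, verify routing end to end by firing a synthetic alert. The sketch below POSTs a throwaway alert to AlertManager's v2 API; it assumes localhost:9093 and the warning route above, and the alert name and labels are made up for the test.

```python
# Sketch: fire a synthetic alert at AlertManager to confirm routing and
# the Slack receivers work end to end. Standard library only.
import json
from urllib.request import Request, urlopen

alert = [{
    "labels": {
        "alertname": "RoutingTest",       # hypothetical test alert
        "severity": "warning",            # should land in #voiceassist-alerts
        "component": "monitoring",
    },
    "annotations": {
        "summary": "Synthetic test alert",
        "description": "Safe to ignore - verifying AlertManager routing",
    },
}]

req = Request(
    "http://localhost:9093/api/v2/alerts",
    data=json.dumps(alert).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urlopen(req, timeout=5) as resp:
    print("AlertManager accepted alert:", resp.status)  # expect 200
```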
---

## Grafana Dashboards

### Provision Datasource

```yaml
# Create monitoring/grafana/provisioning/datasources/prometheus.yml
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
```

### Provision Dashboards

```yaml
# Create monitoring/grafana/provisioning/dashboards/dashboards.yml
apiVersion: 1

providers:
  - name: "VoiceAssist"
    orgId: 1
    folder: "VoiceAssist V2"
    type: file
    disableDeletion: false
    updateIntervalSeconds: 30
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
```

### Application Overview Dashboard

```json
// Create monitoring/grafana/dashboards/application-overview.json
{
  "dashboard": {
    "title": "VoiceAssist - Application Overview",
    "tags": ["voiceassist", "application"],
    "timezone": "browser",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total{job=\"voiceassist-app\"}[5m])",
            "legendFormat": "{{method}} {{endpoint}}"
          }
        ]
      },
      {
        "title": "Response Time (p95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "p95"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total{status=~\"5..\"}[5m])",
            "legendFormat": "5xx errors"
          }
        ]
      },
      {
        "title": "Active Instances",
        "type": "stat",
        "targets": [
          {
            "expr": "count(up{job=\"voiceassist-app\"} == 1)"
          }
        ]
      }
    ]
  }
}
```

### Database Dashboard

```json
// Create monitoring/grafana/dashboards/database.json
{
  "dashboard": {
    "title": "VoiceAssist - Database",
    "tags": ["voiceassist", "database", "postgresql"],
    "panels": [
      {
        "title": "Database Connections",
        "type": "graph",
        "targets": [
          {
            "expr": "pg_stat_database_numbackends",
            "legendFormat": "Active connections"
          }
        ]
      },
      {
        "title": "Query Throughput",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(pg_stat_database_tup_fetched[5m])",
            "legendFormat": "Rows fetched/sec"
          }
        ]
      },
      {
        "title": "Database Size",
        "type": "graph",
        "targets": [
          {
            "expr": "pg_database_size_bytes",
            "legendFormat": "Database size"
          }
        ]
      },
      {
        "title": "Cache Hit Ratio",
        "type": "gauge",
        "targets": [
          {
            "expr": "rate(pg_stat_database_blks_hit[5m]) / (rate(pg_stat_database_blks_hit[5m]) + rate(pg_stat_database_blks_read[5m]))"
          }
        ]
      }
    ]
  }
}
```

### Import Pre-built Dashboards

```bash
# Import Node Exporter dashboard
curl -X POST http://localhost:3000/api/dashboards/import \
  -H "Content-Type: application/json" \
  -u admin:admin \
  -d '{
    "dashboard": {
      "id": null,
      "uid": null,
      "title": "Node Exporter Full",
      "gnetId": 1860
    },
    "overwrite": false,
    "inputs": [
      {
        "name": "DS_PROMETHEUS",
        "type": "datasource",
        "pluginId": "prometheus",
        "value": "Prometheus"
      }
    ]
  }'

# Import PostgreSQL dashboard
curl -X POST http://localhost:3000/api/dashboards/import \
  -H "Content-Type: application/json" \
  -u admin:admin \
  -d '{
    "dashboard": {
      "id": null,
      "uid": null,
      "title": "PostgreSQL Database",
      "gnetId": 9628
    },
    "overwrite": false,
    "inputs": [
      {
        "name": "DS_PROMETHEUS",
        "type": "datasource",
        "pluginId": "prometheus",
        "value": "Prometheus"
      }
    ]
  }'

# Import Redis dashboard
curl -X POST http://localhost:3000/api/dashboards/import \
  -H "Content-Type: application/json" \
  -u admin:admin \
  -d '{
    "dashboard": {
      "id": null,
      "uid": null,
      "title": "Redis Dashboard",
      "gnetId": 11835
    },
    "overwrite": false,
    "inputs": [
      {
        "name": "DS_PROMETHEUS",
        "type": "datasource",
        "pluginId": "prometheus",
        "value": "Prometheus"
      }
    ]
  }'
```
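After provisioning and importing, confirm the dashboards actually registered. A sketch using Grafana's search API, assuming the default admin/admin credentials from the compose file above:

```python
# Sketch: list dashboards via Grafana's HTTP API to verify provisioning.
# Standard library only; assumes Grafana on localhost:3000.
import base64
import json
from urllib.request import Request, urlopen

GRAFANA = "http://localhost:3000"
auth = base64.b64encode(b"admin:admin").decode()

req = Request(
    f"{GRAFANA}/api/search?query=VoiceAssist",
    headers={"Authorization": f"Basic {auth}"},
)
with urlopen(req, timeout=5) as resp:
    for dash in json.load(resp):
        print(f"{dash['title']:45s} uid={dash.get('uid')}")
```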
"value": "Prometheus" } ] }' ``` --- ## Application Metrics ### Instrument Application Code ```python # Add to application code (e.g., app/monitoring.py) from prometheus_client import Counter, Histogram, Gauge, generate_latest from fastapi import FastAPI, Response import time app = FastAPI() # Metrics REQUEST_COUNT = Counter( 'http_requests_total', 'Total HTTP requests', ['method', 'endpoint', 'status'] ) REQUEST_DURATION = Histogram( 'http_request_duration_seconds', 'HTTP request duration in seconds', ['method', 'endpoint'] ) ACTIVE_REQUESTS = Gauge( 'http_requests_active', 'Number of active HTTP requests', ['method', 'endpoint'] ) DB_CONNECTION_POOL = Gauge( 'db_connection_pool_size', 'Database connection pool size', ['state'] # active, idle ) CACHE_OPERATIONS = Counter( 'cache_operations_total', 'Total cache operations', ['operation', 'status'] # get/set, hit/miss ) # Middleware to track metrics @app.middleware("http") async def track_metrics(request, call_next): method = request.method endpoint = request.url.path ACTIVE_REQUESTS.labels(method=method, endpoint=endpoint).inc() start_time = time.time() try: response = await call_next(request) status = response.status_code except Exception as e: status = 500 raise finally: duration = time.time() - start_time REQUEST_COUNT.labels( method=method, endpoint=endpoint, status=status ).inc() REQUEST_DURATION.labels( method=method, endpoint=endpoint ).observe(duration) ACTIVE_REQUESTS.labels(method=method, endpoint=endpoint).dec() return response # Metrics endpoint @app.get("/metrics") async def metrics(): return Response( content=generate_latest(), media_type="text/plain" ) # Custom metric tracking def track_cache_operation(operation: str, hit: bool): """Track cache hit/miss""" status = "hit" if hit else "miss" CACHE_OPERATIONS.labels(operation=operation, status=status).inc() def update_connection_pool_metrics(active: int, idle: int): """Update database connection pool metrics""" DB_CONNECTION_POOL.labels(state="active").set(active) DB_CONNECTION_POOL.labels(state="idle").set(idle) ``` ### Custom Business Metrics ```python # Track business-specific metrics from prometheus_client import Counter, Gauge # User metrics USER_REGISTRATIONS = Counter( 'user_registrations_total', 'Total user registrations' ) ACTIVE_USERS = Gauge( 'active_users', 'Number of currently active users' ) # Conversation metrics CONVERSATIONS_CREATED = Counter( 'conversations_created_total', 'Total conversations created' ) MESSAGES_SENT = Counter( 'messages_sent_total', 'Total messages sent', ['conversation_type'] ) # Voice processing metrics VOICE_PROCESSING_DURATION = Histogram( 'voice_processing_duration_seconds', 'Voice processing duration in seconds' ) VOICE_PROCESSING_ERRORS = Counter( 'voice_processing_errors_total', 'Total voice processing errors', ['error_type'] ) # Usage in application def create_conversation(user_id: int): CONVERSATIONS_CREATED.inc() # ... rest of the logic def send_message(conversation_id: int, message: str): MESSAGES_SENT.labels(conversation_type="text").inc() # ... 
---

## Log Aggregation

### Structured Logging

```python
# Configure structured logging
import logging
import json
from datetime import datetime

class JSONFormatter(logging.Formatter):
    def format(self, record):
        log_data = {
            'timestamp': datetime.utcnow().isoformat(),
            'level': record.levelname,
            'logger': record.name,
            'message': record.getMessage(),
            'module': record.module,
            'function': record.funcName,
            'line': record.lineno
        }

        if record.exc_info:
            log_data['exception'] = self.formatException(record.exc_info)

        if hasattr(record, 'user_id'):
            log_data['user_id'] = record.user_id

        if hasattr(record, 'request_id'):
            log_data['request_id'] = record.request_id

        return json.dumps(log_data)

# Configure logger
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())

logger = logging.getLogger('voiceassist')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Usage
logger.info("User logged in", extra={'user_id': 123})
logger.error("Database connection failed", exc_info=True)
```

### Centralized Logging with Loki

```yaml
# Add to docker-compose.yml
services:
  loki:
    image: grafana/loki:latest
    ports:
      - "3100:3100"
    volumes:
      - ./monitoring/loki-config.yml:/etc/loki/local-config.yaml
      - loki_data:/loki
    command: -config.file=/etc/loki/local-config.yaml

  promtail:
    image: grafana/promtail:latest
    volumes:
      - ./monitoring/promtail-config.yml:/etc/promtail/config.yml
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/run/docker.sock:/var/run/docker.sock
    command: -config.file=/etc/promtail/config.yml
    depends_on:
      - loki

volumes:
  loki_data:
```

```yaml
# Create monitoring/loki-config.yml
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
  chunk_idle_period: 5m
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 168h

storage_config:
  boltdb:
    directory: /loki/index
  filesystem:
    directory: /loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  max_look_back_period: 0s

table_manager:
  retention_deletes_enabled: false
  retention_period: 0s
```

```yaml
# Create monitoring/promtail-config.yml
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      - source_labels: ["__meta_docker_container_name"]
        regex: "/(.*)"
        target_label: "container"
      - source_labels: ["__meta_docker_container_log_stream"]
        target_label: "stream"
```

```bash
# Add Loki datasource to Grafana
curl -X POST http://localhost:3000/api/datasources \
  -H "Content-Type: application/json" \
  -u admin:admin \
  -d '{
    "name": "Loki",
    "type": "loki",
    "url": "http://loki:3100",
    "access": "proxy",
    "isDefault": false
  }'
```
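With promtail shipping container logs, Loki's `query_range` API can also drive scripted reports, such as pulling recent error lines. A sketch, assuming Loki on localhost:3100 and the JSON log format produced by `JSONFormatter` above (the LogQL filter is illustrative):

```python
# Sketch: pull recent error-level logs out of Loki from a script.
# Standard library only.
import json
import time
from urllib.parse import urlencode
from urllib.request import urlopen

LOKI = "http://localhost:3100"

def recent_errors(minutes: int = 60, limit: int = 50) -> list:
    now_ns = time.time_ns()
    params = urlencode({
        # LogQL: container logs whose JSON payload has level=ERROR
        "query": '{container=~".+"} | json | level = "ERROR"',
        "start": now_ns - minutes * 60 * 10**9,
        "end": now_ns,
        "limit": limit,
    })
    with urlopen(f"{LOKI}/loki/api/v1/query_range?{params}", timeout=10) as resp:
        data = json.load(resp)["data"]["result"]
    # Each stream carries (timestamp, line) pairs; keep the lines.
    return [line for stream in data for _, line in stream["values"]]

for line in recent_errors(minutes=30):
    print(line)
```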
---

## Health Checks

### Application Health Endpoints

```python
# Comprehensive health check endpoints
from datetime import datetime
from fastapi import APIRouter, status
from typing import Dict
import asyncio

router = APIRouter()

# db, redis_client, and http_client are assumed to be initialized
# elsewhere at application startup.

@router.get("/health")
async def health_check() -> Dict:
    """Basic health check - always returns 200 if app is running"""
    return {
        "status": "healthy",
        "timestamp": datetime.utcnow().isoformat(),
        "version": "2.0.0"
    }

@router.get("/ready")
async def readiness_check() -> Dict:
    """Readiness check - verifies all dependencies"""
    checks = {
        "database": await check_database(),
        "redis": await check_redis(),
        "qdrant": await check_qdrant()
    }

    all_healthy = all(checks.values())

    return {
        "status": "ready" if all_healthy else "not_ready",
        "timestamp": datetime.utcnow().isoformat(),
        "checks": checks
    }

async def check_database() -> bool:
    """Check database connectivity"""
    try:
        await db.execute("SELECT 1")
        return True
    except Exception:
        return False

async def check_redis() -> bool:
    """Check Redis connectivity"""
    try:
        redis_client.ping()
        return True
    except Exception:
        return False

async def check_qdrant() -> bool:
    """Check Qdrant connectivity"""
    try:
        response = await http_client.get("http://qdrant:6333/healthz")
        return response.status_code == 200
    except Exception:
        return False

@router.get("/live")
async def liveness_check() -> Dict:
    """Liveness check - for Kubernetes/Docker"""
    return {"status": "alive"}
```

### Docker Health Checks

```yaml
# Update docker-compose.yml with health checks
services:
  voiceassist-server:
    # ... existing config ...
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  postgres:
    # ... existing config ...
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U voiceassist"]
      interval: 10s
      timeout: 5s
      retries: 5

  redis:
    # ... existing config ...
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 3s
      retries: 3

  qdrant:
    # ... existing config ...
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:6333/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3
```
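From the outside, the same endpoints can be polled the way an external uptime check would. A sketch, assuming the app is reachable on localhost:8000 (standard library only):

```python
# Sketch: poll the three probe endpoints and surface per-dependency
# detail from /ready.
import json
from urllib.request import urlopen
from urllib.error import URLError, HTTPError

BASE = "http://localhost:8000"

def probe(path: str) -> dict:
    try:
        with urlopen(f"{BASE}{path}", timeout=5) as resp:
            return json.load(resp)
    except (URLError, HTTPError) as exc:
        return {"status": f"unreachable ({exc})"}

for path in ("/health", "/ready", "/live"):
    result = probe(path)
    print(f"{path:10s} -> {result.get('status')}")
    # /ready carries per-dependency checks worth surfacing:
    for dep, ok in result.get("checks", {}).items():
        print(f"    {dep}: {'ok' if ok else 'FAILING'}")
```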
---

## Monitoring Operations

### Daily Monitoring Routine

```bash
#!/bin/bash
# Save as: /usr/local/bin/va-monitoring-daily

echo "VoiceAssist Daily Monitoring Report - $(date)"
echo "=============================================="
echo ""

# 1. Check all services are up
echo "1. Service Health:"
docker compose ps | grep -E "(Up|healthy)" | wc -l
docker compose ps
echo ""

# 2. Check Prometheus targets
echo "2. Prometheus Targets:"
curl -s http://localhost:9090/api/v1/targets | \
  jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
echo ""

# 3. Check for active alerts
echo "3. Active Alerts:"
curl -s http://localhost:9093/api/v1/alerts | \
  jq '.data[] | select(.status.state=="active") | {name: .labels.alertname, severity: .labels.severity}'
echo ""

# 4. Resource usage summary
echo "4. Resource Usage:"
docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemPerc}}" | head -10
echo ""

# 5. Error rate (last 24 hours)
echo "5. Error Rate (24h):"
docker compose logs --since 24h voiceassist-server | grep -i error | wc -l
echo ""

# 6. Database health
echo "6. Database Health:"
docker compose exec -T postgres psql -U voiceassist -d voiceassist </dev/null | tail -1
echo ""

echo "=============================================="
echo "Report completed"
```

### Troubleshooting Monitoring Issues

#### Prometheus Not Scraping Targets

```bash
# Check Prometheus logs
docker compose logs prometheus | tail -50

# Check target configuration
curl -s http://localhost:9090/api/v1/targets | jq '.'

# Verify network connectivity
docker compose exec prometheus wget -O- http://voiceassist-server:8000/metrics

# Reload Prometheus configuration
curl -X POST http://localhost:9090/-/reload
```

#### Grafana Dashboards Not Loading

```bash
# Check Grafana logs
docker compose logs grafana | tail -50

# Verify datasource connection
curl -s http://localhost:3000/api/datasources \
  -u admin:admin | jq '.'

# Test Prometheus connection from Grafana
curl -s http://localhost:3000/api/datasources/proxy/1/api/v1/query?query=up \
  -u admin:admin | jq '.'

# Restart Grafana
docker compose restart grafana
```

#### Alerts Not Firing

```bash
# Check AlertManager status
curl -s http://localhost:9093/api/v1/status | jq '.'

# Check alert rules in Prometheus
curl -s http://localhost:9090/api/v1/rules | jq '.'

# Check specific alert state
curl -s 'http://localhost:9090/api/v1/query?query=ALERTS{alertname="HighErrorRate"}' | jq '.'

# Verify AlertManager configuration
docker compose exec alertmanager amtool config show

# Check AlertManager logs
docker compose logs alertmanager | tail -50
```

---

## Monitoring Best Practices

### 1. Define SLOs (Service Level Objectives)

```yaml
# Document SLOs
SLOs:
  - name: Availability
    target: 99.9%
    measurement: uptime over 30 days

  - name: Response Time
    target: p95 < 500ms
    measurement: 95th percentile of all API requests

  - name: Error Rate
    target: < 0.1%
    measurement: 5xx errors / total requests

  - name: Data Durability
    target: 99.999%
    measurement: no data loss events
```
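The availability target translates directly into an error budget: a 99.9% target over 30 days allows roughly 43 minutes of downtime. A sketch of the arithmetic (pure Python, no dependencies):

```python
# Sketch: error budget implied by the 99.9% availability SLO above, and
# how much of it a given incident consumes.
WINDOW_DAYS = 30
SLO_TARGET = 0.999  # 99.9% availability

budget_minutes = WINDOW_DAYS * 24 * 60 * (1 - SLO_TARGET)
print(f"Error budget per {WINDOW_DAYS}d window: {budget_minutes:.1f} min")
# -> 43.2 minutes of total downtime allowed per 30 days

def budget_spent(downtime_minutes: float) -> float:
    """Fraction of the 30-day error budget consumed by an incident."""
    return downtime_minutes / budget_minutes

for incident in (5, 15, 45):
    print(f"{incident:3d} min outage consumes {budget_spent(incident):.0%} of budget")
```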
### 2. Alert Fatigue Prevention

```yaml
# Guidelines for creating alerts:
# - Alert on symptoms, not causes
# - Make alerts actionable
# - Include runbook links
# - Set appropriate thresholds
# - Use proper severity levels
# - Group related alerts

# Good alert example:
- alert: UserFacingErrorRate
  expr: |
    rate(http_requests_total{status="500"}[5m])
    / rate(http_requests_total[5m]) > 0.05
  for: 5m
  annotations:
    summary: "High user-facing error rate"
    description: "More than 5% of requests failing"
    runbook_url: "https://docs.voiceassist.local/runbooks/troubleshooting#high-error-rate"

# Bad alert example (too noisy):
- alert: SingleError
  expr: increase(http_requests_total{status="500"}[1m]) > 0
  for: 0s
```

### 3. Dashboard Organization

```
Dashboards Structure:
├── Executive Dashboard (high-level KPIs)
├── Application Overview (request rate, errors, latency)
├── Infrastructure (CPU, memory, disk, network)
├── Database Performance (connections, queries, cache hit ratio)
├── Cache Performance (Redis operations, memory, hit rate)
├── Business Metrics (users, conversations, messages)
└── On-Call Dashboard (active alerts, recent incidents)
```

---

## Related Documentation

- [Incident Response Runbook](./INCIDENT_RESPONSE.md)
- [Troubleshooting Runbook](./TROUBLESHOOTING.md)
- [Deployment Runbook](./DEPLOYMENT.md)
- [Scaling Runbook](./SCALING.md)
- [UNIFIED_ARCHITECTURE.md](../../UNIFIED_ARCHITECTURE.md)

---

**Document Version**: 1.0
**Last Updated**: 2025-11-27
**Maintained By**: VoiceAssist DevOps Team
**Review Cycle**: Quarterly
**Next Review**: 2026-02-21