#!/bin/bash

# C Nostr Relay - Monitoring Script
# Comprehensive monitoring for event-based configuration relay
#
# Runs a fixed set of health checks (process, systemd unit, listening port,
# memory, disk, database size/integrity, configuration events) once, or
# repeatedly with -c. Results are printed and appended to LOG_FILE; failures
# can trigger e-mail/webhook alerts and a service restart.
#
# Exit status (single-run mode): 0 if all counted checks passed, 1 otherwise.

# NOTE: with errexit active, every command that is allowed to fail must be
# guarded explicitly (if / || ...), otherwise the monitor itself dies.
set -e

# Colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Configuration
RELAY_DIR="/opt/c-relay"
SERVICE_NAME="c-relay"
PROCESS_PATTERN="c_relay_x86" # matched against full command lines (pgrep -f)
RELAY_PORT="8888"
LOG_FILE="/var/log/relay-monitor.log"
ALERT_EMAIL=""
WEBHOOK_URL=""
CHECK_INTERVAL="60"
MAX_MEMORY_MB="1024"
MAX_DB_SIZE_MB="10240"
MIN_DISK_SPACE_MB="1024"

# Counters for statistics
TOTAL_CHECKS=0
FAILED_CHECKS=0
ALERTS_SENT=0

# Scratch array filled by _collect_db_files
DB_FILES=()

# Functions

# Print an informational message ($1) to stdout and log it.
print_step() {
  printf '%b %s\n' "${BLUE}[INFO]${NC}" "$1"
  log_message "INFO" "$1"
}

# Print a success message ($1) to stdout and log it.
print_success() {
  printf '%b %s\n' "${GREEN}[OK]${NC}" "$1"
  log_message "OK" "$1"
}

# Print a warning message ($1) to stdout and log it.
print_warning() {
  printf '%b %s\n' "${YELLOW}[WARN]${NC}" "$1"
  log_message "WARN" "$1"
}

# Print an error message ($1) to stdout and log it.
print_error() {
  printf '%b %s\n' "${RED}[ERROR]${NC}" "$1"
  log_message "ERROR" "$1"
}

# Append a timestamped line to LOG_FILE.
# Arguments: $1 - level tag, $2 - message text
# Logging is best-effort: a failed write (unwritable path, full disk) must
# not terminate the monitor under `set -e`.
log_message() {
  local level="$1"
  local message="$2"

  {
    printf '%s [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$level" "$message" \
      >> "$LOG_FILE"
  } 2>/dev/null || true
}

# Print usage information to stdout.
show_help() {
  cat <<EOF
Usage: $0 [OPTIONS]

Options:
 -d, --relay-dir DIR Relay directory (default: /opt/c-relay)
 -p, --port PORT Relay port (default: 8888)
 -i, --interval SECONDS Check interval (default: 60)
 -e, --email EMAIL Alert email address
 -w, --webhook URL Webhook URL for alerts
 -m, --max-memory MB Max memory usage alert (default: 1024MB)
 -s, --max-db-size MB Max database size alert (default: 10240MB)
 -f, --min-free-space MB Min disk space alert (default: 1024MB)
 -c, --continuous Run continuously (daemon mode)
 -h, --help Show this help message

Examples:
 $0 # Single check
 $0 -c -i 30 -e admin@example.com # Continuous monitoring
 $0 -w https://hooks.slack.com/... # Webhook notifications
EOF
}

# Abort with usage help unless the option in $1 is followed by a value.
# Arguments: the remaining command-line words, option first.
_require_value() {
  if [[ $# -lt 2 ]]; then
    print_error "Option $1 requires an argument"
    show_help
    exit 1
  fi
}

# Abort unless $2 is a non-negative integer.
# Arguments: $1 - human-readable setting name, $2 - value to validate
_require_uint() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    print_error "Invalid value for $1: '$2' (expected a non-negative integer)"
    exit 1
  fi
}

# Parse command-line options into the configuration globals.
# Globals written: CONTINUOUS, RELAY_DIR, RELAY_PORT, CHECK_INTERVAL,
#                  ALERT_EMAIL, WEBHOOK_URL, MAX_MEMORY_MB, MAX_DB_SIZE_MB,
#                  MIN_DISK_SPACE_MB
# Exits 0 after --help; exits 1 on an unknown option, a missing option
# value, or a non-numeric port/threshold.
parse_args() {
  CONTINUOUS="false"

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -d|--relay-dir)
        _require_value "$@"
        RELAY_DIR="$2"
        shift 2
        ;;
      -p|--port)
        _require_value "$@"
        RELAY_PORT="$2"
        shift 2
        ;;
      -i|--interval)
        _require_value "$@"
        CHECK_INTERVAL="$2"
        shift 2
        ;;
      -e|--email)
        _require_value "$@"
        ALERT_EMAIL="$2"
        shift 2
        ;;
      -w|--webhook)
        _require_value "$@"
        WEBHOOK_URL="$2"
        shift 2
        ;;
      -m|--max-memory)
        _require_value "$@"
        MAX_MEMORY_MB="$2"
        shift 2
        ;;
      -s|--max-db-size)
        _require_value "$@"
        MAX_DB_SIZE_MB="$2"
        shift 2
        ;;
      -f|--min-free-space)
        _require_value "$@"
        MIN_DISK_SPACE_MB="$2"
        shift 2
        ;;
      -c|--continuous)
        CONTINUOUS="true"
        shift
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      *)
        print_error "Unknown option: $1"
        show_help
        exit 1
        ;;
    esac
  done

  # These values feed integer comparisons; reject anything else up front
  # instead of silently mis-evaluating a threshold later.
  _require_uint "port" "$RELAY_PORT"
  _require_uint "max memory (MB)" "$MAX_MEMORY_MB"
  _require_uint "max database size (MB)" "$MAX_DB_SIZE_MB"
  _require_uint "min free space (MB)" "$MIN_DISK_SPACE_MB"
}

# Fill the global array DB_FILES with every *.nrdb path under RELAY_DIR.
# Paths are read NUL-delimited so names with spaces or glob characters
# survive intact. A missing/unreadable directory yields an empty array.
_collect_db_files() {
  DB_FILES=()
  local path

  while IFS= read -r -d '' path; do
    DB_FILES+=("$path")
  done < <(find "$RELAY_DIR" -name "*.nrdb" -print0 2>/dev/null)

  return 0
}

# Escape a string ($1) for embedding inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab.
_json_escape() {
  local s=$1

  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}

  printf '%s' "$s"
}

# Check that a process matching PROCESS_PATTERN exists.
# Returns: 0 if found, 1 otherwise.
check_process_running() {
  print_step "Checking if relay process is running..."

  if pgrep -f "$PROCESS_PATTERN" > /dev/null; then
    print_success "Relay process is running"
    return 0
  else
    print_error "Relay process is not running"
    return 1
  fi
}

# Check that something is listening on TCP port RELAY_PORT.
# Tries netstat first and falls back to ss.
# Returns: 0 if a listener is found, 1 otherwise.
check_port_listening() {
  print_step "Checking if port $RELAY_PORT is listening..."

  if netstat -tln 2>/dev/null | grep -q ":$RELAY_PORT " || \
     ss -tln 2>/dev/null | grep -q ":$RELAY_PORT "; then
    print_success "Port $RELAY_PORT is listening"
    return 0
  else
    print_error "Port $RELAY_PORT is not listening"
    return 1
  fi
}

# Check that the systemd unit SERVICE_NAME is active.
# Returns: 0 if active, 1 otherwise (the reported state is printed).
check_service_status() {
  print_step "Checking systemd service status..."

  if systemctl is-active --quiet "$SERVICE_NAME"; then
    print_success "Service $SERVICE_NAME is active"
    return 0
  fi

  # is-active prints the state *and* exits non-zero for inactive units, so
  # only fall back to "unknown" when it printed nothing at all.
  local status
  status=$(systemctl is-active "$SERVICE_NAME" 2>/dev/null) || true
  print_error "Service $SERVICE_NAME status: ${status:-unknown}"
  return 1
}

# Sum the resident memory of all matching relay processes and compare it
# with MAX_MEMORY_MB.
# Returns: 0 if within the limit, 1 if over the limit or undeterminable.
check_memory_usage() {
  print_step "Checking memory usage..."

  local pids memory_kb
  pids=$(pgrep -d, -f "$PROCESS_PATTERN") || true

  if [[ -n "$pids" ]]; then
    # rss= prints the resident set size (KiB) without a header line.
    memory_kb=$(ps -o rss= -p "$pids" 2>/dev/null \
      | awk '{sum+=$1} END {print sum}')
  fi

  if [[ ! "$memory_kb" =~ ^[0-9]+$ ]]; then
    print_warning "Could not determine memory usage"
    return 1
  fi

  local memory_mb=$((memory_kb / 1024))

  if [[ $memory_mb -gt $MAX_MEMORY_MB ]]; then
    print_error "High memory usage: ${memory_mb}MB (limit: ${MAX_MEMORY_MB}MB)"
    return 1
  else
    print_success "Memory usage: ${memory_mb}MB"
    return 0
  fi
}

# Sum the on-disk size of all readable *.nrdb files and compare it with
# MAX_DB_SIZE_MB.
# Returns: 0 if within the limit, 1 if over the limit or no files exist.
check_database_size() {
  print_step "Checking database size..."

  _collect_db_files

  if [[ ${#DB_FILES[@]} -eq 0 ]]; then
    print_warning "No database files found"
    return 1
  fi

  local total_size=0
  local db_file size_kb

  for db_file in "${DB_FILES[@]}"; do
    if [[ -r "$db_file" ]]; then
      size_kb=$(du -k -- "$db_file" 2>/dev/null | cut -f1)
      # Skip files that vanished or whose size could not be read.
      if [[ "$size_kb" =~ ^[0-9]+$ ]]; then
        total_size=$((total_size + size_kb))
      fi
    fi
  done

  local total_size_mb=$((total_size / 1024))

  if [[ $total_size_mb -gt $MAX_DB_SIZE_MB ]]; then
    print_error "Large database size: ${total_size_mb}MB (limit: ${MAX_DB_SIZE_MB}MB)"
    return 1
  else
    print_success "Database size: ${total_size_mb}MB"
    return 0
  fi
}

# Compare free space on the filesystem holding RELAY_DIR with
# MIN_DISK_SPACE_MB.
# Returns: 0 if enough space, 1 if low or undeterminable.
check_disk_space() {
  print_step "Checking disk space..."

  # -P keeps each filesystem on one line, -k forces 1 KiB units, so column 4
  # of line 2 is reliably "available KiB".
  local free_space_kb
  free_space_kb=$(df -Pk "$RELAY_DIR" 2>/dev/null | awk 'NR==2 {print $4}')

  if [[ ! "$free_space_kb" =~ ^[0-9]+$ ]]; then
    print_warning "Could not determine free disk space for $RELAY_DIR"
    return 1
  fi

  local free_space_mb=$((free_space_kb / 1024))

  if [[ $free_space_mb -lt $MIN_DISK_SPACE_MB ]]; then
    print_error "Low disk space: ${free_space_mb}MB (minimum: ${MIN_DISK_SPACE_MB}MB)"
    return 1
  else
    print_success "Free disk space: ${free_space_mb}MB"
    return 0
  fi
}

# Run `PRAGMA integrity_check` (30 s timeout) on every readable *.nrdb file.
# Returns: 0 if every checked file reports "ok", 1 otherwise or if no files
#          exist.
check_database_integrity() {
  print_step "Checking database integrity..."

  _collect_db_files

  if [[ ${#DB_FILES[@]} -eq 0 ]]; then
    print_warning "No database files to check"
    return 1
  fi

  local integrity_ok=true
  local db_file

  for db_file in "${DB_FILES[@]}"; do
    if [[ -r "$db_file" ]]; then
      # -x: require a line that is exactly "ok"; a substring match would
      # also accept error text that merely contains those letters.
      if timeout 30 sqlite3 "$db_file" "PRAGMA integrity_check;" \
          | grep -qx "ok"; then
        print_success "Database integrity OK: $(basename "$db_file")"
      else
        print_error "Database integrity failed: $(basename "$db_file")"
        integrity_ok=false
      fi
    fi
  done

  if [[ "$integrity_ok" == "true" ]]; then
    return 0
  else
    return 1
  fi
}

# Probe the relay port with an HTTP request carrying WebSocket upgrade
# headers (10 s timeout).
# Returns: 0 if curl exits successfully within the timeout, 1 otherwise.
# NOTE(review): this only tests curl's exit status, not the HTTP response
# code; a failure here is treated as non-critical by the caller.
check_websocket_connection() {
  print_step "Checking WebSocket connection..."

  # Simple connection test using curl
  if timeout 10 curl -s -N -H "Connection: Upgrade" \
       -H "Upgrade: websocket" -H "Sec-WebSocket-Key: test" \
       -H "Sec-WebSocket-Version: 13" \
       "http://localhost:$RELAY_PORT/" >/dev/null 2>&1; then
    print_success "WebSocket connection test passed"
    return 0
  else
    print_warning "WebSocket connection test failed (may be normal)"
    return 1
  fi
}

# Count kind-33334 rows in the events table across all readable *.nrdb
# files.
# Returns: 0 if at least one such event exists, 1 otherwise.
check_configuration_events() {
  print_step "Checking configuration events..."

  _collect_db_files

  if [[ ${#DB_FILES[@]} -eq 0 ]]; then
    print_warning "No database files found"
    return 1
  fi

  local config_count=0
  local db_file count

  for db_file in "${DB_FILES[@]}"; do
    if [[ -r "$db_file" ]]; then
      count=$(sqlite3 "$db_file" \
        "SELECT COUNT(*) FROM events WHERE kind = 33334;" 2>/dev/null) \
        || count=0
      # A failed or unexpected query result counts as zero events.
      if [[ ! "$count" =~ ^[0-9]+$ ]]; then
        count=0
      fi
      config_count=$((config_count + count))
    fi
  done

  if [[ $config_count -gt 0 ]]; then
    print_success "Configuration events found: $config_count"
    return 0
  else
    print_warning "No configuration events found"
    return 1
  fi
}

# Deliver an alert through every configured channel.
# Arguments: $1 - subject, $2 - message body, $3 - severity (sent as the
#            webhook attachment "color")
# Globals:   ALERT_EMAIL, WEBHOOK_URL (read); ALERTS_SENT (incremented once
#            per call)
# Delivery failures are reported as warnings and never abort the monitor.
send_alert() {
  local subject="$1"
  local message="$2"
  local severity="$3"

  ALERTS_SENT=$((ALERTS_SENT + 1))

  # Email alert
  if [[ -n "$ALERT_EMAIL" ]] && command -v mail >/dev/null 2>&1; then
    if printf '%s\n' "$message" | mail -s "$subject" "$ALERT_EMAIL"; then
      print_step "Alert sent to $ALERT_EMAIL"
    else
      print_warning "Failed to send alert to $ALERT_EMAIL"
    fi
  fi

  # Webhook alert
  if [[ -n "$WEBHOOK_URL" ]] && command -v curl >/dev/null 2>&1; then
    # The message is multi-line; raw newlines/quotes would make the payload
    # invalid JSON, so every field is escaped first.
    local webhook_data
    webhook_data=$(printf \
      '{"text":"%s","attachments":[{"color":"%s","text":"%s"}]}' \
      "$(_json_escape "$subject")" \
      "$(_json_escape "$severity")" \
      "$(_json_escape "$message")")

    if curl -X POST -H 'Content-type: application/json' \
         --data "$webhook_data" "$WEBHOOK_URL" >/dev/null 2>&1; then
      print_step "Alert sent to webhook"
    else
      print_warning "Failed to send alert to webhook"
    fi
  fi
}

# Restart the systemd unit SERVICE_NAME and give it 5 s to settle.
# Returns: 0 on success, 1 if systemctl reports failure.
restart_service() {
  print_step "Attempting to restart service..."

  if systemctl restart "$SERVICE_NAME"; then
    print_success "Service restarted successfully"
    sleep 5 # Wait for service to stabilize
    return 0
  else
    print_error "Failed to restart service"
    return 1
  fi
}

# Run one full round of health checks, update the global counters, and on
# failure send alerts and restart the service if the process is gone.
# Globals: TOTAL_CHECKS, FAILED_CHECKS (updated)
# Returns: 0 if all counted checks passed, 1 otherwise.
run_checks() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local failed_checks=0
  local total_checks=8

  # The eight checks that count towards the pass/fail result.
  local -a counted_checks=(
    # Core functionality checks
    check_process_running
    check_service_status
    check_port_listening
    # Resource checks
    check_memory_usage
    check_disk_space
    check_database_size
    # Database checks
    check_database_integrity
    check_configuration_events
  )

  echo
  echo "🔍 Relay Health Check - $timestamp"
  echo "=================================="

  # Plain assignment instead of ((failed_checks++)): the latter returns a
  # non-zero status when the old value is 0, which trips `set -e`.
  local check
  for check in "${counted_checks[@]}"; do
    "$check" || failed_checks=$((failed_checks + 1))
  done

  # Optional checks
  check_websocket_connection || true # Don't count this as critical

  TOTAL_CHECKS=$((TOTAL_CHECKS + total_checks))
  FAILED_CHECKS=$((FAILED_CHECKS + failed_checks))

  # Summary
  echo
  if [[ $failed_checks -eq 0 ]]; then
    print_success "All checks passed ($total_checks/$total_checks)"
    return 0
  fi

  print_error "Failed checks: $failed_checks/$total_checks"

  # Send alert if configured
  if [[ -n "$ALERT_EMAIL" || -n "$WEBHOOK_URL" ]]; then
    local alert_subject="C Nostr Relay Health Alert"
    local alert_message
    alert_message="Relay health check failed.

Failed checks: $failed_checks/$total_checks
Time: $timestamp
Host: $(hostname)
Service: $SERVICE_NAME
Port: $RELAY_PORT

Please check the relay logs:
sudo journalctl -u $SERVICE_NAME --since '10 minutes ago'
"

    send_alert "$alert_subject" "$alert_message" "danger"
  fi

  # Auto-restart if service is down
  if ! check_process_running >/dev/null 2>&1; then
    print_step "Process is down, attempting restart..."
    # A failed restart is already reported; it must not kill the monitor.
    restart_service || true
  fi

  return 1
}

# Print cumulative statistics; prints nothing if no checks have run yet.
show_statistics() {
  if [[ $TOTAL_CHECKS -gt 0 ]]; then
    local success_rate=$(( (TOTAL_CHECKS - FAILED_CHECKS) * 100 / TOTAL_CHECKS ))

    echo
    echo "📊 Monitoring Statistics"
    echo "======================="
    echo "Total Checks: $TOTAL_CHECKS"
    echo "Failed Checks: $FAILED_CHECKS"
    echo "Success Rate: ${success_rate}%"
    echo "Alerts Sent: $ALERTS_SENT"
  fi
}

# Signal handler: report shutdown, show statistics, exit 0.
cleanup() {
  echo
  print_step "Monitoring stopped"
  show_statistics
  exit 0
}

# Main execution
# Arguments: the script's command-line arguments.
# Returns (single-run mode): the status of run_checks.
main() {
  echo
  echo "📡 C Nostr Relay - Health Monitor"
  echo "================================="
  echo

  # Initialize log file. File logging is best-effort, so an unwritable
  # location is reported once and monitoring continues on the console.
  if ! mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null \
      || ! touch "$LOG_FILE" 2>/dev/null; then
    printf '%b %s\n' "${YELLOW}[WARN]${NC}" \
      "Cannot write to $LOG_FILE; continuing without file logging" >&2
  fi

  parse_args "$@"

  # Trap signals for cleanup
  trap cleanup SIGINT SIGTERM

  if [[ "$CONTINUOUS" == "true" ]]; then
    print_step "Starting continuous monitoring (interval: ${CHECK_INTERVAL}s)"
    print_step "Press Ctrl+C to stop"

    while true; do
      # A failed round is an expected outcome, not a reason to stop.
      run_checks || true
      sleep "$CHECK_INTERVAL"
    done
  fi

  # Single run: still show statistics on failure, then propagate the result
  # so cron/systemd callers can see it.
  local status=0
  run_checks || status=$?

  show_statistics

  return "$status"
}

# Run main function
main "$@"