Files
c-relay/examples/deployment/monitoring/monitor-relay.sh

460 lines
12 KiB
Bash
Executable File

#!/bin/bash
# C Nostr Relay - Monitoring Script
# Health monitor for a relay that uses event-based configuration.

# Abort on any unhandled command failure.
set -e

# ANSI color sequences; the backslash escapes are stored literally and are
# interpreted when the print_* helpers emit them.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
NC="\033[0m" # reset / no color

# Default settings. Most of these can be overridden on the command line
# (see show_help / parse_args).
RELAY_DIR=/opt/c-relay
SERVICE_NAME=c-relay
RELAY_PORT=8888
LOG_FILE=/var/log/relay-monitor.log
ALERT_EMAIL=''
WEBHOOK_URL=''
CHECK_INTERVAL=60
MAX_MEMORY_MB=1024
MAX_DB_SIZE_MB=10240
MIN_DISK_SPACE_MB=1024

# Running totals reported by show_statistics.
TOTAL_CHECKS=0
FAILED_CHECKS=0
ALERTS_SENT=0
# Functions
# Print an informational message to stdout behind a blue [INFO] tag and
# record the same message in the log.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
print_step() {
  local msg="$1"
  printf '%b\n' "${BLUE}[INFO]${NC} ${msg}"
  log_message "INFO" "$msg"
}
# Print a success message to stdout behind a green [OK] tag and record the
# same message in the log.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
print_success() {
  local msg="$1"
  printf '%b\n' "${GREEN}[OK]${NC} ${msg}"
  log_message "OK" "$msg"
}
# Print a warning message to stdout behind a yellow [WARN] tag and record
# the same message in the log.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
print_warning() {
  local msg="$1"
  printf '%b\n' "${YELLOW}[WARN]${NC} ${msg}"
  log_message "WARN" "$msg"
}
# Print an error message to stdout behind a red [ERROR] tag and record the
# same message in the log.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
print_error() {
  local msg="$1"
  printf '%b\n' "${RED}[ERROR]${NC} ${msg}"
  log_message "ERROR" "$msg"
}
# Append one timestamped line to the monitor log, formatted as
# "YYYY-MM-DD HH:MM:SS [LEVEL] message".
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $1 - level tag written between the square brackets
#            $2 - message text
log_message() {
  printf '%s [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" "$2" >> "$LOG_FILE"
}
# Print the usage summary (options and examples) to stdout.
# $0 is interpolated so the text shows the name the script was invoked as.
show_help() {
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "" \
    "Options:" \
    " -d, --relay-dir DIR Relay directory (default: /opt/c-relay)" \
    " -p, --port PORT Relay port (default: 8888)" \
    " -i, --interval SECONDS Check interval (default: 60)" \
    " -e, --email EMAIL Alert email address" \
    " -w, --webhook URL Webhook URL for alerts" \
    " -m, --max-memory MB Max memory usage alert (default: 1024MB)" \
    " -s, --max-db-size MB Max database size alert (default: 10240MB)" \
    " -f, --min-free-space MB Min disk space alert (default: 1024MB)" \
    " -c, --continuous Run continuously (daemon mode)" \
    " -h, --help Show this help message" \
    "" \
    "Examples:" \
    " $0 # Single check" \
    " $0 -c -i 30 -e admin@example.com # Continuous monitoring" \
    " $0 -w https://hooks.slack.com/... # Webhook notifications"
}
# Abort with an error and the usage text when a value-taking option is the
# last word on the command line.
# Arguments: "$@" - the remaining command-line words, option first
_require_option_value() {
  if [[ $# -lt 2 ]]; then
    print_error "Option $1 requires an argument"
    show_help
    exit 1
  fi
}

# Parse command-line options into the global configuration variables.
# Globals:   CONTINUOUS (written, "true"/"false"), RELAY_DIR, RELAY_PORT,
#            CHECK_INTERVAL, ALERT_EMAIL, WEBHOOK_URL, MAX_MEMORY_MB,
#            MAX_DB_SIZE_MB, MIN_DISK_SPACE_MB (written when the matching
#            option is given)
# Arguments: "$@" - the script's command-line arguments
# Exits:     0 after printing help for -h/--help; 1 on an unknown option or
#            on a value-taking option that has no value. Without the latter
#            check `shift 2` fails, which aborts silently under `set -e`.
parse_args() {
  CONTINUOUS="false"
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -d|--relay-dir)
        _require_option_value "$@"
        RELAY_DIR="$2"
        shift 2
        ;;
      -p|--port)
        _require_option_value "$@"
        RELAY_PORT="$2"
        shift 2
        ;;
      -i|--interval)
        _require_option_value "$@"
        CHECK_INTERVAL="$2"
        shift 2
        ;;
      -e|--email)
        _require_option_value "$@"
        ALERT_EMAIL="$2"
        shift 2
        ;;
      -w|--webhook)
        _require_option_value "$@"
        WEBHOOK_URL="$2"
        shift 2
        ;;
      -m|--max-memory)
        _require_option_value "$@"
        MAX_MEMORY_MB="$2"
        shift 2
        ;;
      -s|--max-db-size)
        _require_option_value "$@"
        MAX_DB_SIZE_MB="$2"
        shift 2
        ;;
      -f|--min-free-space)
        _require_option_value "$@"
        MIN_DISK_SPACE_MB="$2"
        shift 2
        ;;
      -c|--continuous)
        CONTINUOUS="true"
        shift
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      *)
        print_error "Unknown option: $1"
        show_help
        exit 1
        ;;
    esac
  done
}
# Report whether any process whose command line matches "c_relay_x86"
# exists (looked up with pgrep -f).
# Returns: 0 when a match is found, 1 otherwise
check_process_running() {
  print_step "Checking if relay process is running..."
  if ! pgrep -f "c_relay_x86" > /dev/null; then
    print_error "Relay process is not running"
    return 1
  fi
  print_success "Relay process is running"
  return 0
}
# Report whether a TCP listener exists on RELAY_PORT. netstat is consulted
# first; ss is tried only when netstat shows no match (or is unavailable).
# Globals: RELAY_PORT (read)
# Returns: 0 when a listening socket on the port is found, 1 otherwise
check_port_listening() {
  print_step "Checking if port $RELAY_PORT is listening..."
  # The trailing space keeps e.g. ":8888" from matching ":88880".
  local needle=":$RELAY_PORT "
  local found=false
  if netstat -tln 2>/dev/null | grep -q "$needle"; then
    found=true
  elif ss -tln 2>/dev/null | grep -q "$needle"; then
    found=true
  fi
  if [[ "$found" != "true" ]]; then
    print_error "Port $RELAY_PORT is not listening"
    return 1
  fi
  print_success "Port $RELAY_PORT is listening"
  return 0
}
# Report whether the systemd unit SERVICE_NAME is active.
# Globals: SERVICE_NAME (read)
# Returns: 0 when `systemctl is-active` succeeds, 1 otherwise
check_service_status() {
  print_step "Checking systemd service status..."
  # `systemctl is-active` prints the unit state AND exits non-zero when the
  # unit is not active, so query once and keep both results. Appending a
  # fallback with `|| echo unknown` inside the substitution would produce
  # "inactive<newline>unknown".
  local status rc=0
  status=$(systemctl is-active "$SERVICE_NAME" 2>/dev/null) || rc=$?
  if [[ $rc -eq 0 ]]; then
    print_success "Service $SERVICE_NAME is active"
    return 0
  fi
  # No output at all (e.g. systemctl missing) is reported as "unknown".
  print_error "Service $SERVICE_NAME status: ${status:-unknown}"
  return 1
}
# Sum the sixth column (RSS, in KB) of every `ps aux` line that mentions
# "c_relay_x86" and compare the total against MAX_MEMORY_MB.
# Globals: MAX_MEMORY_MB (read)
# Returns: 0 when usage is within the limit; 1 when it exceeds the limit or
#          when no matching process line was found
check_memory_usage() {
  print_step "Checking memory usage..."
  # Lines containing "grep" are excluded so a grep over the process list is
  # never counted as the relay itself.
  local rss_kb
  rss_kb=$(ps aux | awk '/c_relay_x86/ && !/grep/ { total += $6 } END { print total }')
  if [[ -z "$rss_kb" ]]; then
    print_warning "Could not determine memory usage"
    return 1
  fi
  local rss_mb=$((rss_kb / 1024))
  if [[ $rss_mb -le $MAX_MEMORY_MB ]]; then
    print_success "Memory usage: ${rss_mb}MB"
    return 0
  fi
  print_error "High memory usage: ${rss_mb}MB (limit: ${MAX_MEMORY_MB}MB)"
  return 1
}
# Add up the on-disk size of every *.nrdb file under RELAY_DIR and compare
# the total against MAX_DB_SIZE_MB.
# Globals: RELAY_DIR, MAX_DB_SIZE_MB (read)
# Returns: 0 when the total is within the limit; 1 when it exceeds the
#          limit or when no database file was found
check_database_size() {
  print_step "Checking database size..."
  # Read NUL-delimited paths so names containing spaces or glob characters
  # stay intact (an unquoted $(find ...) would split them into pieces).
  local -a db_files=()
  local db_file
  while IFS= read -r -d '' db_file; do
    db_files+=("$db_file")
  done < <(find "$RELAY_DIR" -name "*.nrdb" -print0 2>/dev/null)
  if [[ ${#db_files[@]} -eq 0 ]]; then
    print_warning "No database files found"
    return 1
  fi
  local total_size=0
  local size_kb
  for db_file in "${db_files[@]}"; do
    [[ -r "$db_file" ]] || continue
    size_kb=$(du -k "$db_file" 2>/dev/null | cut -f1)
    # Skip anything du could not size instead of breaking the arithmetic.
    [[ "$size_kb" =~ ^[0-9]+$ ]] || continue
    total_size=$((total_size + size_kb))
  done
  local total_size_mb=$((total_size / 1024))
  if [[ $total_size_mb -gt $MAX_DB_SIZE_MB ]]; then
    print_error "Large database size: ${total_size_mb}MB (limit: ${MAX_DB_SIZE_MB}MB)"
    return 1
  else
    print_success "Database size: ${total_size_mb}MB"
    return 0
  fi
}
# Compare the free space of the filesystem holding RELAY_DIR against
# MIN_DISK_SPACE_MB.
# Globals: RELAY_DIR, MIN_DISK_SPACE_MB (read)
# Returns: 0 when enough space is free; 1 when below the minimum or when
#          the free space could not be determined
check_disk_space() {
  print_step "Checking disk space..."
  # -P forces one line per filesystem (long device names are otherwise
  # wrapped, which breaks the NR==2 parse); -k pins the unit to 1024 bytes.
  local free_space_kb
  free_space_kb=$(df -Pk "$RELAY_DIR" 2>/dev/null | awk 'NR==2 {print $4}')
  if [[ ! "$free_space_kb" =~ ^[0-9]+$ ]]; then
    print_warning "Could not determine free disk space for $RELAY_DIR"
    return 1
  fi
  local free_space_mb=$((free_space_kb / 1024))
  if [[ $free_space_mb -lt $MIN_DISK_SPACE_MB ]]; then
    print_error "Low disk space: ${free_space_mb}MB (minimum: ${MIN_DISK_SPACE_MB}MB)"
    return 1
  else
    print_success "Free disk space: ${free_space_mb}MB"
    return 0
  fi
}
# Run SQLite's "PRAGMA integrity_check" (30-second timeout each) against
# every readable *.nrdb file under RELAY_DIR.
# Globals: RELAY_DIR (read)
# Returns: 0 when every checked file passes; 1 when any file fails or when
#          no database file was found
check_database_integrity() {
  print_step "Checking database integrity..."
  # Read NUL-delimited paths so names containing spaces stay intact.
  local -a db_files=()
  local db_file
  while IFS= read -r -d '' db_file; do
    db_files+=("$db_file")
  done < <(find "$RELAY_DIR" -name "*.nrdb" -print0 2>/dev/null)
  if [[ ${#db_files[@]} -eq 0 ]]; then
    print_warning "No database files to check"
    return 1
  fi
  local failures=0
  local result
  for db_file in "${db_files[@]}"; do
    [[ -r "$db_file" ]] || continue
    result=$(timeout 30 sqlite3 "$db_file" "PRAGMA integrity_check;") || result=""
    # A healthy database yields exactly the single row "ok". A substring
    # match would also accept error text that merely contains "ok".
    if [[ "$result" == "ok" ]]; then
      print_success "Database integrity OK: $(basename "$db_file")"
    else
      print_error "Database integrity failed: $(basename "$db_file")"
      failures=$((failures + 1))
    fi
  done
  if [[ $failures -eq 0 ]]; then
    return 0
  else
    return 1
  fi
}
# Probe the relay with an HTTP request carrying WebSocket upgrade headers,
# using curl under a 10-second timeout. Only the exit status of the
# timeout/curl command is inspected; the response itself is discarded.
# Globals: RELAY_PORT (read)
# Returns: 0 when the command succeeds, 1 otherwise
check_websocket_connection() {
  print_step "Checking WebSocket connection..."
  local -a upgrade_headers=(
    -H "Connection: Upgrade"
    -H "Upgrade: websocket"
    -H "Sec-WebSocket-Key: test"
    -H "Sec-WebSocket-Version: 13"
  )
  local target="http://localhost:$RELAY_PORT/"
  if ! timeout 10 curl -s -N "${upgrade_headers[@]}" "$target" >/dev/null 2>&1; then
    print_warning "WebSocket connection test failed (may be normal)"
    return 1
  fi
  print_success "WebSocket connection test passed"
  return 0
}
# Count rows with kind = 33334 in the events table of every readable
# *.nrdb file under RELAY_DIR.
# Globals: RELAY_DIR (read)
# Returns: 0 when at least one such row exists; 1 when none exist or when
#          no database file was found
check_configuration_events() {
  print_step "Checking configuration events..."
  # Read NUL-delimited paths so names containing spaces stay intact.
  local -a db_files=()
  local db_file
  while IFS= read -r -d '' db_file; do
    db_files+=("$db_file")
  done < <(find "$RELAY_DIR" -name "*.nrdb" -print0 2>/dev/null)
  if [[ ${#db_files[@]} -eq 0 ]]; then
    print_warning "No database files found"
    return 1
  fi
  local config_count=0
  local count
  for db_file in "${db_files[@]}"; do
    [[ -r "$db_file" ]] || continue
    count=$(sqlite3 "$db_file" "SELECT COUNT(*) FROM events WHERE kind = 33334;" 2>/dev/null) || count=0
    # Treat anything that is not a plain number (empty output, partial
    # error text) as zero rather than breaking the arithmetic below.
    [[ "$count" =~ ^[0-9]+$ ]] || count=0
    config_count=$((config_count + count))
  done
  if [[ $config_count -gt 0 ]]; then
    print_success "Configuration events found: $config_count"
    return 0
  else
    print_warning "No configuration events found"
    return 1
  fi
}
send_alert() {
local subject="$1"
local message="$2"
local severity="$3"
ALERTS_SENT=$((ALERTS_SENT + 1))
# Email alert
if [[ -n "$ALERT_EMAIL" ]] && command -v mail >/dev/null 2>&1; then
echo -e "$message" | mail -s "$subject" "$ALERT_EMAIL"
print_step "Alert sent to $ALERT_EMAIL"
fi
# Webhook alert
if [[ -n "$WEBHOOK_URL" ]] && command -v curl >/dev/null 2>&1; then
local webhook_data="{\"text\":\"$subject\",\"attachments\":[{\"color\":\"$severity\",\"text\":\"$message\"}]}"
curl -X POST -H 'Content-type: application/json' \
--data "$webhook_data" "$WEBHOOK_URL" >/dev/null 2>&1
print_step "Alert sent to webhook"
fi
}
# Restart the systemd unit SERVICE_NAME; after a successful restart, pause
# five seconds so the service can settle before anything else runs.
# Globals: SERVICE_NAME (read)
# Returns: 0 when `systemctl restart` succeeds, 1 otherwise
restart_service() {
  print_step "Attempting to restart service..."
  if ! systemctl restart "$SERVICE_NAME"; then
    print_error "Failed to restart service"
    return 1
  fi
  print_success "Service restarted successfully"
  sleep 5 # Give the service time to stabilize
  return 0
}
# Run one full health-check cycle: the critical checks, then the optional
# WebSocket probe, then a summary. On failure, send an alert (when a
# channel is configured) and try to restart the service if the relay
# process is not running.
# Globals: TOTAL_CHECKS, FAILED_CHECKS (updated); ALERT_EMAIL, WEBHOOK_URL,
#          SERVICE_NAME, RELAY_PORT (read)
# Returns: 0 when every critical check passed, 1 otherwise
run_checks() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp="unknown"
  local -a critical_checks=(
    # Core functionality checks
    check_process_running
    check_service_status
    check_port_listening
    # Resource checks
    check_memory_usage
    check_disk_space
    check_database_size
    # Database checks
    check_database_integrity
    check_configuration_events
  )
  local total_checks=${#critical_checks[@]}
  local failed_checks=0
  local check

  echo
  echo "🔍 Relay Health Check - $timestamp"
  echo "=================================="

  for check in "${critical_checks[@]}"; do
    # Plain assignment rather than ((failed_checks++)): the post-increment
    # evaluates to 0 on the first failure, which returns status 1 and
    # terminates the whole script under `set -e`.
    "$check" || failed_checks=$((failed_checks + 1))
  done

  # Optional check: never counted, and never allowed to trip `set -e`.
  check_websocket_connection || true

  TOTAL_CHECKS=$((TOTAL_CHECKS + total_checks))
  FAILED_CHECKS=$((FAILED_CHECKS + failed_checks))

  # Summary
  echo
  if [[ $failed_checks -eq 0 ]]; then
    print_success "All checks passed ($total_checks/$total_checks)"
    return 0
  fi

  print_error "Failed checks: $failed_checks/$total_checks"

  # Send alert if configured
  if [[ -n "$ALERT_EMAIL" || -n "$WEBHOOK_URL" ]]; then
    local alert_subject="C Nostr Relay Health Alert"
    local alert_message="Relay health check failed.
Failed checks: $failed_checks/$total_checks
Time: $timestamp
Host: $(hostname)
Service: $SERVICE_NAME
Port: $RELAY_PORT
Please check the relay logs:
sudo journalctl -u $SERVICE_NAME --since '10 minutes ago'
"
    send_alert "$alert_subject" "$alert_message" "danger" \
      || print_warning "Alert delivery reported an error"
  fi

  # Auto-restart if the process is down. A failed restart has already been
  # reported by restart_service and must not abort the monitor.
  if ! check_process_running >/dev/null 2>&1; then
    print_step "Process is down, attempting restart..."
    restart_service || true
  fi
  return 1
}
# Print cumulative monitoring statistics to stdout. Prints nothing until at
# least one check has been recorded in TOTAL_CHECKS.
# Globals: TOTAL_CHECKS, FAILED_CHECKS, ALERTS_SENT (read)
show_statistics() {
  [[ $TOTAL_CHECKS -gt 0 ]] || return 0
  # Integer percentage, truncated toward zero.
  local passed=$((TOTAL_CHECKS - FAILED_CHECKS))
  local success_rate=$((passed * 100 / TOTAL_CHECKS))
  printf '%s\n' \
    "" \
    "📊 Monitoring Statistics" \
    "=======================" \
    "Total Checks: $TOTAL_CHECKS" \
    "Failed Checks: $FAILED_CHECKS" \
    "Success Rate: ${success_rate}%" \
    "Alerts Sent: $ALERTS_SENT"
}
# Shutdown handler: announce that monitoring stopped, print the collected
# statistics, and terminate the script with status 0.
cleanup() {
  printf '\n'
  print_step "Monitoring stopped"
  show_statistics
  exit 0
}
# Main execution
# Entry point: print the banner, prepare the log file, parse options, then
# run either a single check cycle or an endless monitoring loop.
# Globals:   LOG_FILE, CHECK_INTERVAL (read); CONTINUOUS (set by parse_args)
# Arguments: "$@" - the script's command-line arguments
# Returns:   the status of run_checks in single-shot mode; continuous mode
#            only ends through the signal trap (cleanup)
main() {
  echo
  echo "📡 C Nostr Relay - Health Monitor"
  echo "================================="
  echo

  # Initialize log file
  mkdir -p "$(dirname "$LOG_FILE")"
  touch "$LOG_FILE"

  parse_args "$@"

  # Trap signals for cleanup
  trap cleanup SIGINT SIGTERM

  local status=0
  if [[ "$CONTINUOUS" == "true" ]]; then
    print_step "Starting continuous monitoring (interval: ${CHECK_INTERVAL}s)"
    print_step "Press Ctrl+C to stop"
    while true; do
      # A failing cycle is exactly what the monitor exists to observe; it
      # must not terminate the loop under `set -e`.
      run_checks || true
      sleep "$CHECK_INTERVAL"
    done
  else
    # Capture the status so statistics are still printed after a failure.
    run_checks || status=$?
  fi

  show_statistics
  return "$status"
}
# Script entry: invoke main, forwarding every command-line argument
# unchanged ("$@" keeps each argument as a separate word).
main "$@"