#!/bin/bash

# C Nostr Relay - Monitoring Script
# Comprehensive monitoring for event-based configuration relay
#
# Run with --help for the list of supported options.

# Abort on any unhandled command failure. Code that expects a command to
# fail must test it explicitly (if / || / &&).
set -e

# Colors for output. These are backslash escapes, so they only render when
# printed through `echo -e` or `printf '%b'`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Configuration defaults.
# parse_args can override every value below except SERVICE_NAME and LOG_FILE.
RELAY_DIR="/opt/c-relay"
SERVICE_NAME="c-relay"
RELAY_PORT="8888"
LOG_FILE="/var/log/relay-monitor.log"
ALERT_EMAIL=""           # empty: no e-mail alerts
WEBHOOK_URL=""           # empty: no webhook alerts
CHECK_INTERVAL="60"      # seconds between rounds in continuous mode
MAX_MEMORY_MB="1024"
MAX_DB_SIZE_MB="10240"
MIN_DISK_SPACE_MB="1024"

# Counters for statistics (accumulated across check rounds)
TOTAL_CHECKS=0
FAILED_CHECKS=0
ALERTS_SENT=0

# Functions

# Print an informational message to stdout and record it in the log.
# Arguments: $1 - message text (printed verbatim)
print_step() {
  # Only the color codes go through %b. The message uses %s so that
  # backslashes in it (e.g. in a path) are not treated as escapes.
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
  log_message "INFO" "$1"
}

# Print a success message to stdout and record it in the log.
# Arguments: $1 - message text (printed verbatim)
print_success() {
  # Only the color codes go through %b. The message uses %s so that
  # backslashes in it are not treated as escapes.
  printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$1"
  log_message "OK" "$1"
}

# Print a warning message to stdout and record it in the log.
# Arguments: $1 - message text (printed verbatim)
print_warning() {
  # Only the color codes go through %b. The message uses %s so that
  # backslashes in it are not treated as escapes.
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"
  log_message "WARN" "$1"
}

# Print an error message to stdout and record it in the log.
# Arguments: $1 - message text (printed verbatim)
print_error() {
  # Only the color codes go through %b. The message uses %s so that
  # backslashes in it are not treated as escapes.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"
  log_message "ERROR" "$1"
}

# Append one timestamped line to $LOG_FILE.
# Arguments: $1 - level tag (e.g. INFO, WARN)
#            $2 - message text
# Returns:   always 0
log_message() {
  local level="$1"
  local message="$2"
  local stamp

  stamp=$(date '+%Y-%m-%d %H:%M:%S') || stamp="unknown-time"

  # Logging is deliberately best-effort: an unwritable log file must not
  # abort the monitor (the script runs under `set -e`). stderr is redirected
  # first so a failed open of $LOG_FILE stays quiet as well.
  printf '%s [%s] %s\n' "$stamp" "$level" "$message" 2>/dev/null >> "$LOG_FILE" || true
}

# Print the usage text to stdout. $0 is expanded, so the examples show the
# name the script was invoked with.
show_help() {
  cat <<EOF
Usage: $0 [OPTIONS]

Options:
 -d, --relay-dir DIR Relay directory (default: /opt/c-relay)
 -p, --port PORT Relay port (default: 8888)
 -i, --interval SECONDS Check interval (default: 60)
 -e, --email EMAIL Alert email address
 -w, --webhook URL Webhook URL for alerts
 -m, --max-memory MB Max memory usage alert (default: 1024MB)
 -s, --max-db-size MB Max database size alert (default: 10240MB)
 -f, --min-free-space MB Min disk space alert (default: 1024MB)
 -c, --continuous Run continuously (daemon mode)
 -h, --help Show this help message

Examples:
 $0 # Single check
 $0 -c -i 30 -e admin@example.com # Continuous monitoring
 $0 -w https://hooks.slack.com/... # Webhook notifications
EOF
}

# Parse command-line options into the global configuration variables.
# Globals written: CONTINUOUS, RELAY_DIR, RELAY_PORT, CHECK_INTERVAL,
#                  ALERT_EMAIL, WEBHOOK_URL, MAX_MEMORY_MB, MAX_DB_SIZE_MB,
#                  MIN_DISK_SPACE_MB
# Arguments: the script's command-line arguments
# Exits:     0 after printing help (-h/--help);
#            1 on an unknown option, a missing value or a malformed number
parse_args() {
  CONTINUOUS="false"

  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"

    # Options that take a value: make sure one was supplied. Without this,
    # `shift 2` fails when the option is the last argument and the loop
    # never advances.
    case "$opt" in
      -d|--relay-dir|-p|--port|-i|--interval|-e|--email|-w|--webhook|-m|--max-memory|-s|--max-db-size|-f|--min-free-space)
        if [[ $# -lt 2 ]]; then
          print_error "Option $opt requires an argument"
          show_help
          exit 1
        fi
        ;;
    esac

    # These values are later used in integer comparisons, which evaluate
    # their operands as arithmetic expressions, so accept plain decimal
    # integers only. The interval is handed to `sleep` unchanged and is
    # therefore not restricted here.
    case "$opt" in
      -p|--port|-m|--max-memory|-s|--max-db-size|-f|--min-free-space)
        if [[ ! "$2" =~ ^(0|[1-9][0-9]*)$ ]]; then
          print_error "Option $opt expects a non-negative integer, got: $2"
          show_help
          exit 1
        fi
        ;;
    esac

    case "$opt" in
      -d|--relay-dir)
        RELAY_DIR="$2"
        shift 2
        ;;
      -p|--port)
        RELAY_PORT="$2"
        shift 2
        ;;
      -i|--interval)
        CHECK_INTERVAL="$2"
        shift 2
        ;;
      -e|--email)
        ALERT_EMAIL="$2"
        shift 2
        ;;
      -w|--webhook)
        WEBHOOK_URL="$2"
        shift 2
        ;;
      -m|--max-memory)
        MAX_MEMORY_MB="$2"
        shift 2
        ;;
      -s|--max-db-size)
        MAX_DB_SIZE_MB="$2"
        shift 2
        ;;
      -f|--min-free-space)
        MIN_DISK_SPACE_MB="$2"
        shift 2
        ;;
      -c|--continuous)
        CONTINUOUS="true"
        shift
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      *)
        print_error "Unknown option: $opt"
        show_help
        exit 1
        ;;
    esac
  done
}

# Check whether a relay process exists.
# Globals: RELAY_PROCESS_PATTERN (read, optional) - pattern matched against
#          full command lines; defaults to "c_relay_x86"
# Returns: 0 if pgrep finds at least one match, 1 otherwise
check_process_running() {
  print_step "Checking if relay process is running..."

  local pattern="${RELAY_PROCESS_PATTERN:-c_relay_x86}"

  if ! pgrep -f -- "$pattern" > /dev/null; then
    print_error "Relay process is not running"
    return 1
  fi

  print_success "Relay process is running"
  return 0
}

# Check whether a TCP socket is listening on $RELAY_PORT.
# Tries `netstat -tln` first and falls back to `ss -tln`.
# Returns: 0 if a listening socket on the port is found, 1 otherwise
check_port_listening() {
  print_step "Checking if port $RELAY_PORT is listening..."

  # Compare the text after the last ':' of the address columns with the
  # port, instead of searching the raw line for ":PORT ". Column 4 is the
  # local address in both tools' default layout. Column 5 is also tested in
  # case a leading column shifts the layout; when it is the peer column of a
  # listening socket it ends in '*' and cannot match.
  local listing_filter='
    function port_of(addr,    parts, n) {
      n = split(addr, parts, ":")
      return parts[n]
    }
    port != "" && (port_of($4) == port || port_of($5) == port) { found = 1 }
    END { exit !found }
  '

  if netstat -tln 2>/dev/null | awk -v port="$RELAY_PORT" "$listing_filter" \
    || ss -tln 2>/dev/null | awk -v port="$RELAY_PORT" "$listing_filter"; then
    print_success "Port $RELAY_PORT is listening"
    return 0
  fi

  print_error "Port $RELAY_PORT is not listening"
  return 1
}

# Check whether the systemd unit $SERVICE_NAME is active.
# Returns: 0 if `systemctl is-active` succeeds, 1 otherwise
check_service_status() {
  print_step "Checking systemd service status..."

  # One call gives both answers: the exit status says whether the unit is
  # active and stdout carries the state name for the error message.
  local status
  if status=$(systemctl is-active "$SERVICE_NAME" 2>/dev/null); then
    print_success "Service $SERVICE_NAME is active"
    return 0
  fi

  # Fall back to "unknown" only when systemctl printed nothing.
  print_error "Service $SERVICE_NAME status: ${status:-unknown}"
  return 1
}

# Compare the relay's total resident memory with $MAX_MEMORY_MB.
# Globals: RELAY_PROCESS_PATTERN (read, optional) - pattern matched against
#          full command lines; defaults to "c_relay_x86"
# Returns: 0 if usage is within the limit,
#          1 if it is above the limit or could not be determined
check_memory_usage() {
  print_step "Checking memory usage..."

  local pattern="${RELAY_PROCESS_PATTERN:-c_relay_x86}"
  local pids=""
  local memory_kb=""

  # pgrep selects the processes (comma-separated PID list); ps then reports
  # their resident set sizes, which are summed across all matches.
  pids=$(pgrep -d, -f -- "$pattern" 2>/dev/null) || pids=""
  if [[ -n "$pids" ]]; then
    # Print a sum only if ps produced at least one row, so a process that
    # vanished in between is reported as "unknown" rather than 0 MB.
    memory_kb=$(ps -o rss= -p "$pids" 2>/dev/null \
      | awk '{ sum += $1 } END { if (NR > 0) print sum + 0 }') || memory_kb=""
  fi

  if [[ ! "$memory_kb" =~ ^[0-9]+$ ]]; then
    print_warning "Could not determine memory usage"
    return 1
  fi

  local memory_mb=$((memory_kb / 1024))

  if [[ $memory_mb -gt $MAX_MEMORY_MB ]]; then
    print_error "High memory usage: ${memory_mb}MB (limit: ${MAX_MEMORY_MB}MB)"
    return 1
  fi

  print_success "Memory usage: ${memory_mb}MB"
  return 0
}

# Compare the combined size of all *.nrdb files under $RELAY_DIR with
# $MAX_DB_SIZE_MB. Sizes come from `du -k`.
# Returns: 0 if the total is within the limit,
#          1 if it is above the limit or no database file was found
check_database_size() {
  print_step "Checking database size..."

  local db_file size_kb
  local found=0
  local total_kb=0

  # NUL-delimited find output keeps paths containing whitespace intact.
  while IFS= read -r -d '' db_file; do
    found=$((found + 1))
    [[ -r "$db_file" ]] || continue

    size_kb=$(du -k -- "$db_file" 2>/dev/null | cut -f1) || size_kb=""
    # Skip anything du could not size rather than feeding it to arithmetic.
    [[ "$size_kb" =~ ^[0-9]+$ ]] || continue
    total_kb=$((total_kb + size_kb))
  done < <(find "$RELAY_DIR" -name "*.nrdb" -print0 2>/dev/null)

  if [[ $found -eq 0 ]]; then
    print_warning "No database files found"
    return 1
  fi

  local total_size_mb=$((total_kb / 1024))

  if [[ $total_size_mb -gt $MAX_DB_SIZE_MB ]]; then
    print_error "Large database size: ${total_size_mb}MB (limit: ${MAX_DB_SIZE_MB}MB)"
    return 1
  fi

  print_success "Database size: ${total_size_mb}MB"
  return 0
}

# Compare the free space on the filesystem holding $RELAY_DIR with
# $MIN_DISK_SPACE_MB.
# Returns: 0 if enough space is free,
#          1 if space is low or the free space could not be determined
check_disk_space() {
  print_step "Checking disk space..."

  # -P forces the POSIX one-line-per-filesystem layout and -k fixes the
  # unit at 1024-byte blocks, so row 2 / column 4 is "available KB".
  local free_space_kb
  free_space_kb=$(df -Pk "$RELAY_DIR" 2>/dev/null | awk 'NR==2 {print $4}') \
    || free_space_kb=""

  # df prints nothing usable when the directory does not exist.
  if [[ ! "$free_space_kb" =~ ^[0-9]+$ ]]; then
    print_warning "Could not determine free disk space for $RELAY_DIR"
    return 1
  fi

  local free_space_mb=$((free_space_kb / 1024))

  if [[ $free_space_mb -lt $MIN_DISK_SPACE_MB ]]; then
    print_error "Low disk space: ${free_space_mb}MB (minimum: ${MIN_DISK_SPACE_MB}MB)"
    return 1
  fi

  print_success "Free disk space: ${free_space_mb}MB"
  return 0
}

# Run `PRAGMA integrity_check` on every readable *.nrdb file under
# $RELAY_DIR, allowing 30 seconds per file.
# Returns: 0 if every checked file reports exactly "ok",
#          1 if any file fails or no database file was found
check_database_integrity() {
  print_step "Checking database integrity..."

  local db_file report
  local found=0
  local failures=0

  # NUL-delimited find output keeps paths containing whitespace intact.
  while IFS= read -r -d '' db_file; do
    found=$((found + 1))
    [[ -r "$db_file" ]] || continue

    report=$(timeout 30 sqlite3 "$db_file" "PRAGMA integrity_check;" 2>/dev/null) \
      || report=""

    # Require the whole output to be "ok". A substring match would also
    # accept an error report that merely contains those letters.
    if [[ "$report" == "ok" ]]; then
      print_success "Database integrity OK: ${db_file##*/}"
    else
      print_error "Database integrity failed: ${db_file##*/}"
      failures=$((failures + 1))
    fi
  done < <(find "$RELAY_DIR" -name "*.nrdb" -print0 2>/dev/null)

  if [[ $found -eq 0 ]]; then
    print_warning "No database files to check"
    return 1
  fi

  if [[ $failures -gt 0 ]]; then
    return 1
  fi
  return 0
}

# Attempt a WebSocket opening handshake against localhost:$RELAY_PORT.
# Returns: 0 if the server answers "101 Switching Protocols", 1 otherwise
check_websocket_connection() {
  print_step "Checking WebSocket connection..."

  # The verdict is based on the HTTP status, not on curl's exit code: after
  # a successful upgrade the connection stays open, so curl normally ends by
  # hitting --max-time even though the handshake worked.
  # The key is the sample nonce from RFC 6455 (a well-formed 16-byte
  # base64 value).
  local http_status
  http_status=$(curl -s -o /dev/null -w '%{http_code}' \
    --connect-timeout 5 --max-time 5 \
    -H "Connection: Upgrade" \
    -H "Upgrade: websocket" \
    -H "Sec-WebSocket-Key: dGhlIHNhbXBsZSBub25jZQ==" \
    -H "Sec-WebSocket-Version: 13" \
    "http://localhost:$RELAY_PORT/" 2>/dev/null) || true

  if [[ "$http_status" == "101" ]]; then
    print_success "WebSocket connection test passed"
    return 0
  fi

  print_warning "WebSocket connection test failed (may be normal)"
  return 1
}

# Count events of kind 33334 across all readable *.nrdb files under
# $RELAY_DIR.
# Returns: 0 if at least one such event exists,
#          1 if there are none or no database file was found
check_configuration_events() {
  print_step "Checking configuration events..."

  local db_file count
  local found=0
  local config_count=0

  # NUL-delimited find output keeps paths containing whitespace intact.
  while IFS= read -r -d '' db_file; do
    found=$((found + 1))
    [[ -r "$db_file" ]] || continue

    count=$(sqlite3 "$db_file" \
      "SELECT COUNT(*) FROM events WHERE kind = 33334;" 2>/dev/null) || count=0
    # A failed or empty query counts as zero instead of breaking the sum.
    [[ "$count" =~ ^[0-9]+$ ]] || count=0
    config_count=$((config_count + count))
  done < <(find "$RELAY_DIR" -name "*.nrdb" -print0 2>/dev/null)

  if [[ $found -eq 0 ]]; then
    print_warning "No database files found"
    return 1
  fi

  if [[ $config_count -gt 0 ]]; then
    print_success "Configuration events found: $config_count"
    return 0
  fi

  print_warning "No configuration events found"
  return 1
}

# Escape a string for use inside a JSON string literal (backslash, double
# quote, newline, carriage return, tab).
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout, without a trailing newline
_json_escape() {
  local text="$1"
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

# Deliver an alert by e-mail and/or webhook, depending on which of
# $ALERT_EMAIL and $WEBHOOK_URL are set and which tools are installed.
# Globals:   ALERTS_SENT (incremented once per call)
# Arguments: $1 - subject line
#            $2 - message body
#            $3 - severity, sent as the webhook attachment "color"
# Returns:   always 0; delivery failures are reported as warnings so that a
#            broken alert channel cannot stop the monitor under `set -e`
send_alert() {
  local subject="$1"
  local message="$2"
  local severity="$3"

  ALERTS_SENT=$((ALERTS_SENT + 1))

  # Email alert
  if [[ -n "$ALERT_EMAIL" ]] && command -v mail >/dev/null 2>&1; then
    # %b keeps backslash escapes in the message working.
    if printf '%b\n' "$message" | mail -s "$subject" "$ALERT_EMAIL"; then
      print_step "Alert sent to $ALERT_EMAIL"
    else
      print_warning "Failed to send alert email to $ALERT_EMAIL"
    fi
  fi

  # Webhook alert
  if [[ -n "$WEBHOOK_URL" ]] && command -v curl >/dev/null 2>&1; then
    # Every field is escaped: the alert body spans several lines, and raw
    # newlines or quotes would make the payload invalid JSON.
    local webhook_data
    webhook_data="{\"text\":\"$(_json_escape "$subject")\",\"attachments\":[{\"color\":\"$(_json_escape "$severity")\",\"text\":\"$(_json_escape "$message")\"}]}"

    # --max-time keeps an unresponsive endpoint from stalling the monitor.
    if curl -s -X POST -H 'Content-type: application/json' --max-time 10 \
      --data "$webhook_data" "$WEBHOOK_URL" >/dev/null 2>&1; then
      print_step "Alert sent to webhook"
    else
      print_warning "Failed to deliver webhook alert"
    fi
  fi

  return 0
}

# Restart the systemd unit $SERVICE_NAME.
# Returns: 0 if `systemctl restart` succeeded (after a 5 second pause),
#          1 otherwise
restart_service() {
  print_step "Attempting to restart service..."

  if ! systemctl restart "$SERVICE_NAME"; then
    print_error "Failed to restart service"
    return 1
  fi

  print_success "Service restarted successfully"
  sleep 5 # Wait for service to stabilize
  return 0
}

# Run one round of health checks, update the global counters, and on
# failure send an alert (if a channel is configured) and try to restart the
# service when the relay process was not found.
# Globals: TOTAL_CHECKS, FAILED_CHECKS (updated);
#          ALERT_EMAIL, WEBHOOK_URL, SERVICE_NAME, RELAY_PORT (read)
# Returns: 0 if all counted checks passed, 1 otherwise
run_checks() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local failed_checks=0
  local total_checks=8
  local process_up=true
  local check

  echo
  echo "🔍 Relay Health Check - $timestamp"
  echo "=================================="

  # Failures are counted with $((...)) assignments. `((failed_checks++))`
  # returns status 1 when the counter is 0, which would terminate the
  # script under `set -e` on the very first failed check.

  # Core functionality checks. The process result is kept for the
  # auto-restart decision below.
  if ! check_process_running; then
    failed_checks=$((failed_checks + 1))
    process_up=false
  fi

  # Remaining core, resource and database checks, in the original order.
  for check in \
    check_service_status \
    check_port_listening \
    check_memory_usage \
    check_disk_space \
    check_database_size \
    check_database_integrity \
    check_configuration_events; do
    "$check" || failed_checks=$((failed_checks + 1))
  done

  # Optional check: not counted, and its failure must not abort the run.
  check_websocket_connection || true

  TOTAL_CHECKS=$((TOTAL_CHECKS + total_checks))
  FAILED_CHECKS=$((FAILED_CHECKS + failed_checks))

  # Summary
  echo
  if [[ $failed_checks -eq 0 ]]; then
    print_success "All checks passed ($total_checks/$total_checks)"
    return 0
  fi

  print_error "Failed checks: $failed_checks/$total_checks"

  # Send alert if configured
  if [[ -n "$ALERT_EMAIL" || -n "$WEBHOOK_URL" ]]; then
    local alert_subject="C Nostr Relay Health Alert"
    local alert_message="Relay health check failed.

Failed checks: $failed_checks/$total_checks
Time: $timestamp
Host: $(hostname)
Service: $SERVICE_NAME
Port: $RELAY_PORT

Please check the relay logs:
sudo journalctl -u $SERVICE_NAME --since '10 minutes ago'
"
    send_alert "$alert_subject" "$alert_message" "danger" \
      || print_warning "Alert delivery failed"
  fi

  # Auto-restart if the process was not found.
  if [[ "$process_up" != "true" ]]; then
    print_step "Process is down, attempting restart..."
    # restart_service reports its own failure; the run still ends with
    # status 1 either way, so its result is not propagated.
    restart_service || true
  fi

  return 1
}

# Print the accumulated monitoring statistics to stdout.
# Prints nothing if no checks have been recorded yet.
# Globals: TOTAL_CHECKS, FAILED_CHECKS, ALERTS_SENT (read)
show_statistics() {
  if [[ $TOTAL_CHECKS -gt 0 ]]; then
    # Integer percentage of checks that did not fail (rounded down).
    local success_rate=$(( (TOTAL_CHECKS - FAILED_CHECKS) * 100 / TOTAL_CHECKS ))

    echo
    echo "📊 Monitoring Statistics"
    echo "======================="
    echo "Total Checks: $TOTAL_CHECKS"
    echo "Failed Checks: $FAILED_CHECKS"
    echo "Success Rate: ${success_rate}%"
    echo "Alerts Sent: $ALERTS_SENT"
  fi
}

# Announce that monitoring stopped, print the statistics and exit with
# status 0.
cleanup() {
  echo
  print_step "Monitoring stopped"
  show_statistics
  exit 0
}

# Main execution

# Entry point: print the banner, prepare the log file, parse the options
# and run the checks either once or in an endless loop.
# Arguments: the script's command-line arguments
# Returns:   single-run mode - the status of run_checks (0 = all passed);
#            continuous mode - does not return
main() {
  echo
  echo "📡 C Nostr Relay - Health Monitor"
  echo "================================="
  echo

  # Initialize log file
  mkdir -p "$(dirname "$LOG_FILE")"
  touch "$LOG_FILE"

  parse_args "$@"

  # Trap signals for cleanup
  trap cleanup SIGINT SIGTERM

  if [[ "$CONTINUOUS" == "true" ]]; then
    print_step "Starting continuous monitoring (interval: ${CHECK_INTERVAL}s)"
    print_step "Press Ctrl+C to stop"

    while true; do
      # A failed round is the event being monitored for, not a reason to
      # stop; without the guard `set -e` would end the loop on the first
      # unhealthy cycle.
      run_checks || true
      sleep "$CHECK_INTERVAL"
    done
  fi

  # Single run: capture the status so the statistics are still printed when
  # checks fail, then hand the status back to the caller.
  local status=0
  run_checks || status=$?

  show_statistics
  return "$status"
}

# Run main function

main "$@"