Add comprehensive thread exit monitoring and signal handling for debugging thread crashes

This commit is contained in:
Your Name
2025-08-16 16:36:28 -04:00
parent d4e609039e
commit fb45970bbb
3 changed files with 171 additions and 11 deletions

View File

@@ -1 +1 @@
0.1.27
0.1.28

Binary file not shown.

View File

@@ -16,6 +16,8 @@
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <signal.h>
#include <sys/types.h>
#include "nostr_core_lib/nostr_core/nostr_common.h" // Common definitions and init/cleanup
#include "nostr_core_lib/nostr_core/nip001.h" // Basic protocol functions
#include "nostr_core_lib/nostr_core/nip013.h" // Proof-of-work functions
@@ -27,6 +29,18 @@
#define DEFAULT_THREADS 4
#define DEFAULT_POW 2
// Thread exit codes (returned by miner_thread as its pthread exit value)
#define THREAD_EXIT_SUCCESS 0 // Found solution or normal completion
#define THREAD_EXIT_STOPPED 1 // Stopped by main thread
#define THREAD_EXIT_ERROR 2 // Error occurred
// Global variables for debugging
// Both flags below are assigned from inside signal_handler() (directly or via
// emergency_shutdown()), so they are declared volatile sig_atomic_t: the object
// type ISO C guarantees may be written from an asynchronous signal handler.
static volatile sig_atomic_t g_signal_received = 0; // Number of the last signal caught (0 = none)
static volatile sig_atomic_t g_shutdown_requested = 0; // Non-zero once emergency_shutdown() has run
static pthread_t* g_thread_handles = NULL; // Thread handle array of the mining run in progress
static int g_thread_count = 0; // Number of worker contexts reachable via g_worker_contexts
static void* g_worker_contexts = NULL; // Will be cast to mining_context_t*
// Forward declarations for callbacks
typedef struct mining_context mining_context_t;
@@ -89,6 +103,13 @@ static void* miner_thread(void* arg);
static int mine_event(mining_context_t* ctx);
static void cleanup_context(mining_context_t* ctx);
// Signal handling and debugging functions
static void signal_handler(int sig); // Shared handler: records the signal, reports it on stderr, requests shutdown
static void install_signal_handlers(void); // Registers signal_handler for the crash and termination signals
static void log_thread_exit(int thread_id, void* exit_status, const char* reason); // Prints a timestamped thread-exit line to stdout
static const char* get_signal_name(int sig); // Maps a signal number to a descriptive string literal
static void emergency_shutdown(void); // Sets the global shutdown flag and marks every worker context as should_stop
// Callback implementations
static void solution_found_callback(cJSON* solution, void* user_data);
static void verbose_pow_callback(int current_difficulty, uint64_t nonce, void* user_data);
@@ -244,6 +265,90 @@ static char* read_stdin_json(void) {
return buffer;
}
// Signal handling and debugging functions
// Translate a signal number into a descriptive label.
//   sig - signal number as delivered to a handler
// Returns a pointer to a string literal (never NULL, never to be freed).
// Signals other than the six named below yield "Unknown signal".
// Only comparisons and returns are performed, so no library code is executed.
static const char* get_signal_name(int sig) {
    // Crash-type signals
    if (sig == SIGSEGV) {
        return "SIGSEGV (Segmentation fault)";
    }
    if (sig == SIGABRT) {
        return "SIGABRT (Abort)";
    }
    if (sig == SIGFPE) {
        return "SIGFPE (Floating point exception)";
    }
    if (sig == SIGBUS) {
        return "SIGBUS (Bus error)";
    }

    // Termination requests
    if (sig == SIGINT) {
        return "SIGINT (Interrupt)";
    }
    if (sig == SIGTERM) {
        return "SIGTERM (Terminate)";
    }

    return "Unknown signal";
}
// Print a timestamped "[HH:MM:SS] Thread N exited: ..." line to stdout.
//   thread_id   - index of the worker being reported
//   exit_status - the thread's exit value: an integer THREAD_EXIT_* code
//                 carried inside a void* (see miner_thread)
//   reason      - short human-readable explanation; NULL is tolerated
// This function is invoked from worker threads as well as from the main
// thread, so it uses localtime_r() with a caller-owned struct tm instead of
// localtime(), whose shared static result buffer would be a data race.
static void log_thread_exit(int thread_id, void* exit_status, const char* reason) {
    // Round-trip through intptr_t: that is how the code was packed into the pointer
    const long status_code = (long)(intptr_t)exit_status;
    const char* reason_text = reason ? reason : "(no reason given)";

    time_t now = time(NULL);
    struct tm local_tm;

    if (localtime_r(&now, &local_tm) != NULL) {
        printf("[%02d:%02d:%02d] Thread %d exited: %s (status: %ld)\n",
               local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec,
               thread_id, reason_text, status_code);
    } else {
        // Time conversion failed: still report the exit rather than crash
        printf("[--:--:--] Thread %d exited: %s (status: %ld)\n",
               thread_id, reason_text, status_code);
    }
    fflush(stdout);
}
// Ask every part of the miner to stop.
// Raises the global g_shutdown_requested flag, then, if worker contexts have
// been published through g_worker_contexts, sets should_stop on each of the
// g_thread_count contexts in ascending index order.
static void emergency_shutdown(void) {
    g_shutdown_requested = 1;

    // Nothing more to do when no mining run has published its contexts
    if (!g_worker_contexts) {
        return;
    }

    mining_context_t* workers = (mining_context_t*)g_worker_contexts;
    int idx = 0;
    while (idx < g_thread_count) {
        workers[idx].should_stop = 1;
        ++idx;
    }
}
static void signal_handler(int sig) {
static volatile sig_atomic_t in_signal_handler = 0;
// Prevent recursive signal handling
if (in_signal_handler) {
return;
}
in_signal_handler = 1;
g_signal_received = sig;
// Log the signal (async-signal-safe functions only)
const char* sig_name = get_signal_name(sig);
// Write to stderr using async-signal-safe functions
write(STDERR_FILENO, "\n[SIGNAL] Received ", 19);
write(STDERR_FILENO, sig_name, strlen(sig_name));
write(STDERR_FILENO, "\n", 1);
// For fatal signals, try emergency shutdown
if (sig == SIGSEGV || sig == SIGABRT || sig == SIGFPE || sig == SIGBUS) {
write(STDERR_FILENO, "[SIGNAL] Attempting emergency shutdown...\n", 42);
emergency_shutdown();
// Reset signal handler to default and re-raise
signal(sig, SIG_DFL);
raise(sig);
} else if (sig == SIGINT || sig == SIGTERM) {
// Graceful shutdown for interrupt signals
write(STDERR_FILENO, "[SIGNAL] Initiating graceful shutdown...\n", 41);
emergency_shutdown();
}
in_signal_handler = 0;
}
// Route the crash signals and the termination signals to signal_handler().
// Registration uses signal() and happens in the order listed in the table.
static void install_signal_handlers(void) {
    static const int handled_signals[] = {
        // Crash signals
        SIGSEGV, SIGABRT, SIGFPE, SIGBUS,
        // Graceful shutdown requests
        SIGINT, SIGTERM
    };
    const int handled_total = (int)(sizeof handled_signals / sizeof handled_signals[0]);

    for (int slot = 0; slot < handled_total; ++slot) {
        signal(handled_signals[slot], signal_handler);
    }
}
// Callback implementations
static void solution_found_callback(cJSON* solution, void* user_data) {
main_context_t* main_ctx = (main_context_t*)user_data;
@@ -293,9 +398,10 @@ static void verbose_pow_callback(int current_difficulty, uint64_t nonce, void* u
fflush(stdout);
}
// Mining thread function - New callback-based approach
// Mining thread function - Enhanced with exit status monitoring
static void* miner_thread(void* arg) {
mining_context_t* ctx = (mining_context_t*)arg;
void* exit_status = (void*)(intptr_t)THREAD_EXIT_ERROR; // Default to error
// Initialize thread-specific timing for verbose mode
ctx->thread_start_time = time(NULL);
@@ -304,20 +410,22 @@ static void* miner_thread(void* arg) {
// Create a copy of the event for this thread
char* event_str = cJSON_Print(ctx->event);
if (!event_str) {
return NULL;
log_thread_exit(ctx->thread_id, exit_status, "JSON serialization failed");
return exit_status;
}
cJSON* local_event = cJSON_Parse(event_str);
free(event_str);
if (!local_event) {
return NULL;
log_thread_exit(ctx->thread_id, exit_status, "Event parsing failed");
return exit_status;
}
uint64_t attempts = 0;
// Mine until solution found or signaled to stop by main thread
while (!ctx->should_stop) {
while (!ctx->should_stop && !g_shutdown_requested) {
// Attempt mining with verbose callback if enabled
void (*progress_cb)(int, uint64_t, void*) = ctx->verbose_enabled ? verbose_pow_callback : NULL;
@@ -334,20 +442,37 @@ static void* miner_thread(void* arg) {
if (ctx->solution_callback) {
ctx->solution_callback(local_event, ctx->user_data);
}
exit_status = (void*)(intptr_t)THREAD_EXIT_SUCCESS;
log_thread_exit(ctx->thread_id, exit_status, "Solution found");
break; // Exit after reporting solution
}
// Check for emergency shutdown
if (g_shutdown_requested) {
exit_status = (void*)(intptr_t)THREAD_EXIT_STOPPED;
log_thread_exit(ctx->thread_id, exit_status, "Emergency shutdown");
break;
}
// Small delay to prevent CPU overuse and allow responsive stopping
usleep(100); // 0.1ms - more responsive to should_stop signal
}
// Normal stop by main thread
if (ctx->should_stop && !g_shutdown_requested) {
exit_status = (void*)(intptr_t)THREAD_EXIT_STOPPED;
log_thread_exit(ctx->thread_id, exit_status, "Stopped by main thread");
}
cJSON_Delete(local_event);
return NULL;
return exit_status;
}
// Main mining function - New hub-and-spoke model
// Main mining function - Enhanced with debugging and monitoring
static int mine_event(mining_context_t* ctx) {
// Install signal handlers for crash detection
install_signal_handlers();
// Set up main context for centralized control
main_context_t main_ctx;
memset(&main_ctx, 0, sizeof(main_context_t));
@@ -386,7 +511,13 @@ static int mine_event(mining_context_t* ctx) {
return -1;
}
// Set up global debugging variables
g_thread_handles = threads;
g_thread_count = ctx->thread_count;
g_worker_contexts = worker_contexts;
time_t start_time = time(NULL);
printf("[DEBUG] Starting %d mining threads...\n", ctx->thread_count);
// Start threads
for (int i = 0; i < ctx->thread_count; i++) {
@@ -398,18 +529,21 @@ static int mine_event(mining_context_t* ctx) {
}
// Wait for threads that were created
for (int j = 0; j < i; j++) {
pthread_join(threads[j], NULL);
void* exit_status;
pthread_join(threads[j], &exit_status);
log_thread_exit(j, exit_status, "Cleanup after creation failure");
}
free(threads);
free(worker_contexts);
pthread_mutex_destroy(&main_ctx.result_mutex);
return -1;
}
printf("[DEBUG] Thread %d started successfully\n", i);
}
// Main thread control loop - centralized monitoring
int result = 0;
while (!main_ctx.solution_found && !main_ctx.timeout_reached) {
while (!main_ctx.solution_found && !main_ctx.timeout_reached && !g_shutdown_requested) {
// Check for timeout
if (ctx->timeout_seconds > 0) {
time_t current_time = time(NULL);
@@ -420,30 +554,56 @@ static int mine_event(mining_context_t* ctx) {
}
}
// Check for signals
if (g_signal_received) {
printf("[DEBUG] Signal received, shutting down...\n");
break;
}
// Small sleep to avoid busy waiting
usleep(10000); // 10ms
}
// Signal all workers to stop
printf("[DEBUG] Signaling threads to stop...\n");
for (int i = 0; i < ctx->thread_count; i++) {
worker_contexts[i].should_stop = 1;
}
// Wait for all threads to finish
// Wait for all threads to finish and capture exit statuses
printf("[DEBUG] Waiting for threads to finish...\n");
for (int i = 0; i < ctx->thread_count; i++) {
pthread_join(threads[i], NULL);
void* exit_status;
pthread_join(threads[i], &exit_status);
// Log exit status if it wasn't already logged by the thread
long status_code = (long)exit_status;
if (status_code == THREAD_EXIT_ERROR) {
log_thread_exit(i, exit_status, "Thread error (not previously logged)");
}
}
// Handle results
if (main_ctx.solution_found && main_ctx.result_event) {
ctx->result_event = main_ctx.result_event; // Transfer ownership
result = 1; // Success
printf("[DEBUG] Solution found successfully\n");
} else if (main_ctx.timeout_reached) {
result = -1; // Timeout
printf("[DEBUG] Mining timed out\n");
} else if (g_shutdown_requested || g_signal_received) {
result = -3; // Signal/emergency shutdown
printf("[DEBUG] Emergency shutdown completed\n");
} else {
result = -2; // Error
printf("[DEBUG] Mining failed with error\n");
}
// Clear global debugging variables
g_thread_handles = NULL;
g_thread_count = 0;
g_worker_contexts = NULL;
// Cleanup
free(threads);
free(worker_contexts);