Troubleshooting Guide
Overview
This guide provides comprehensive troubleshooting procedures for common issues encountered when running and maintaining Savitri Network nodes. It covers diagnostic techniques, problem resolution, and preventive measures for all node types.
Table of Contents
- Prerequisites
- Diagnostic Tools
- Common Issues
- Node-Specific Troubleshooting
- Network Issues
- Storage Issues
- Performance Issues
- Security Issues
- Recovery Procedures
- Preventive Maintenance
Prerequisites
Required Tools
# System monitoring tools
sudo apt install -y \
htop \
iotop \
nethogs \
strace \
lsof \
netstat \
tcpdump \
wireshark
# Debugging tools
sudo apt install -y \
gdb \
valgrind \
perf \
massif-visualizer
# Log analysis tools
sudo apt install -y \
jq \
awk \
sed \
grep \
less
Debug Configuration
Enable Debug Logging:
# Set RUST_LOG environment variable
export RUST_LOG=debug,savitri=trace
# Enable backtraces
export RUST_BACKTRACE=1
# Full backtraces for detailed debugging
export RUST_BACKTRACE=full
Node Configuration for Debugging:
# node.toml
[debug]
enable_profiling = true
enable_tracing = true
log_level = "trace"
metrics_interval_seconds = 10
panic_on_errors = false
[debug.profiling]
cpu_profiling = true
memory_profiling = true
block_profiling = true
Diagnostic Tools
System Diagnostics
System Health Check:
#!/bin/bash
# system-health-check.sh
echo "=== System Health Check ==="
echo "Time: $(date)"
echo "Hostname: $(hostname)"
echo "Uptime: $(uptime -p)"
echo ""
echo "=== CPU Information ==="
lscpu | grep -E "(Model name|CPU\(s\)|Thread|Core)"
echo ""
echo "=== Memory Information ==="
free -h
echo ""
echo "=== Disk Information ==="
df -h
echo ""
echo "=== Network Information ==="
ip addr show | grep -E "(inet|UP|DOWN)"
echo ""
echo "=== Process Information ==="
ps aux | grep savitri | head -10
echo ""
echo "=== Service Status ==="
systemctl status savitri-node --no-pager
echo ""
Resource Usage Monitor:
#!/bin/bash
# resource-monitor.sh
NODE_PID=$(pgrep savitri-node)
if [ -z "$NODE_PID" ]; then
echo "Savitri node not running"
exit 1
fi
echo "=== Resource Usage for PID $NODE_PID ==="
echo "CPU: $(ps -p $NODE_PID -o %cpu --no-headers)%"
echo "Memory: $(ps -p $NODE_PID -o %mem --no-headers)%"
echo "Threads: $(ps -p $NODE_PID -o nlwp --no-headers)"
echo "FD Count: $(lsof -p $NODE_PID | wc -l)"
echo ""
echo "=== I/O Statistics ==="
iotop -p $NODE_PID -b -n 1 2>/dev/null || echo "iotop not available"
echo ""
echo "=== Network Connections ==="
netstat -p | grep $NODE_PID | head -10
echo ""
Blockchain Diagnostics
Node Status Check:
// diagnostic.rs
use std::process::Command;
pub struct NodeDiagnostics {
pub node_id: String,
pub block_height: u64,
pub peer_count: u64,
pub sync_status: SyncStatus,
pub consensus_status: ConsensusStatus,
pub mempool_size: u64,
}
impl NodeDiagnostics {
pub async fn collect() -> Result<Self, DiagnosticError> {
let node_id = Self::get_node_id().await?;
let block_height = Self::get_block_height().await?;
let peer_count = Self::get_peer_count().await?;
let sync_status = Self::get_sync_status().await?;
let consensus_status = Self::get_consensus_status().await?;
let mempool_size = Self::get_mempool_size().await?;
Ok(Self {
node_id,
block_height,
peer_count,
sync_status,
consensus_status,
mempool_size,
})
}
async fn get_node_id() -> Result<String, DiagnosticError> {
let output = Command::new("curl")
.args(&["-s", "http://localhost:30333/node_id"])
.output()
.await?;
Ok(String::from_utf8(output.stdout)?)
}
// Similar methods for other diagnostics...
}
Consensus Health Check:
#!/bin/bash
# consensus-health.sh
echo "=== Consensus Health Check ==="
# Check if node is in committee
COMMITTEE_STATUS=$(curl -s http://localhost:30333/consensus/status | jq -r '.in_committee')
if [ "$COMMITTEE_STATUS" = "true" ]; then
echo "✓ Node is in consensus committee"
# Get current slot
CURRENT_SLOT=$(curl -s http://localhost:30333/consensus/slot | jq -r '.current_slot')
echo "Current slot: $CURRENT_SLOT"
# Check last block produced
LAST_BLOCK=$(curl -s http://localhost:30333/consensus/last_block | jq -r '.height')
echo "Last block produced: $LAST_BLOCK"
# Check participation rate
PARTICIPATION=$(curl -s http://localhost:30333/consensus/participation | jq -r '.participation_rate')
echo "Participation rate: $PARTIPATION%"
if (( $(echo "$PARTIPATION < 95" | bc -l) )); then
echo "⚠️ Low participation rate detected"
fi
else
echo "ℹ️ Node is not in consensus committee"
fi
echo ""
Common Issues
Node Startup Issues
Issue: Node fails to start
# Check configuration file
./savitri-node --config node.toml --check-config
# Validate genesis file
./savitri-node --validate-genesis genesis.json
# Check permissions
ls -la /var/lib/savitri/
ls -la /etc/savitri/
# Check for port conflicts
netstat -tlnp | grep :30333
Issue: Configuration errors
# Validate TOML configuration
python3 -c "import tomllib; tomllib.load(open('node.toml'))"
# Check required fields
grep -E "^(network|storage|consensus)" node.toml
# Validate network settings
curl -s http://localhost:30333/config | jq .
Sync Issues
Issue: Node not syncing
// sync-diagnostics.rs
pub struct SyncDiagnostics {
pub current_height: u64,
pub target_height: u64,
pub sync_speed: f64,
pub peer_count: u64,
pub sync_status: SyncStatus,
}
impl SyncDiagnostics {
pub async fn diagnose_sync_issues() -> Vec<SyncIssue> {
let mut issues = Vec::new();
let diagnostics = Self::collect().await?;
// Check if stuck
if diagnostics.sync_speed < 0.1 {
issues.push(SyncIssue::SlowSync);
}
// Check peer connectivity
if diagnostics.peer_count < 3 {
issues.push(SyncIssue::InsufficientPeers);
}
// Check if far behind
let height_gap = diagnostics.target_height - diagnostics.current_height;
if height_gap > 1000 {
issues.push(SyncIssue::FarBehind);
}
issues
}
}
Sync Recovery Commands:
# Reset sync state
./savitri-node --reset-sync-state
# Force resync from trusted checkpoint
./savitri-node --force-resync --checkpoint-height 1000000
# Clear bad blocks
./savitri-node --clear-bad-blocks
# Rebuild index
./savitri-node --rebuild-index
Memory Issues
Issue: High memory usage
# Check memory usage
ps aux | grep savitri-node | awk '{print $4, $11}'
# Monitor memory over time
watch -n 5 'ps aux | grep savitri-node | head -5'
# Check for memory leaks
valgrind --tool=massif ./savitri-node
# Analyze heap usage
heaptrack ./savitri-node
Memory Optimization:
// memory-config.rs
pub struct MemoryConfig {
pub max_cache_size_mb: u64,
pub gc_threshold_percent: f64,
pub memory_limit_mb: u64,
pub enable_memory_profiling: bool,
}
impl Default for MemoryConfig {
fn default() -> Self {
Self {
max_cache_size_mb: 1024,
gc_threshold_percent: 80.0,
memory_limit_mb: 4096,
enable_memory_profiling: false,
}
}
}
Network Issues
Issue: Peer connection problems
# Check peer connectivity
curl -s http://localhost:30333/peers | jq '.peers | length'
# Test network connectivity
ping -c 3 bootstrap-node.savitri.network
# Check firewall rules
sudo ufw status
sudo iptables -L
# Monitor network traffic
tcpdump -i eth0 port 30333
Network Recovery:
# Reset peer connections
curl -X POST http://localhost:30333/peers/reset
# Add bootstrap nodes manually
curl -X POST http://localhost:30333/peers/add \
-H "Content-Type: application/json" \
-d '{"address": "/ip4/1.2.3.4/tcp/30333"}'
# Check DNS resolution
nslookup bootstrap-node.savitri.network
Node-Specific Troubleshooting
Full Node Issues
Issue: Block validation failures
// validation-diagnostics.rs
pub struct ValidationDiagnostics {
pub failed_blocks: u64,
pub validation_errors: Vec<ValidationError>,
pub last_valid_block: u64,
pub validation_speed: f64,
}
impl ValidationDiagnostics {
pub async fn diagnose_validation_issues() -> Vec<ValidationIssue> {
let mut issues = Vec::new();
// Check for consistent validation failures
if self.failed_blocks > 10 {
issues.push(ValidationIssue::ConsistentFailures);
}
// Check validation speed
if self.validation_speed < 1.0 {
issues.push(ValidationIssue::SlowValidation);
}
// Analyze error patterns
for error in &self.validation_errors {
match error.error_type {
ErrorType::Signature => issues.push(ValidationIssue::SignatureErrors),
ErrorType::StateRoot => issues.push(ValidationIssue::StateRootMismatch),
ErrorType::Timestamp => issues.push(ValidationIssue::TimestampInvalid),
}
}
issues
}
}
Full Node Recovery:
# Reset validation cache
./savitri-node --reset-validation-cache
# Rebuild state from blocks
./savitri-node --rebuild-state
# Clear corrupted blocks
./savitri-node --clear-corrupted-blocks
# Verify database integrity
./savitri-node --verify-database
Validator Node Issues
Issue: Slashing events
// slashing-diagnostics.rs
pub struct SlashingDiagnostics {
pub slashing_events: Vec<SlashingEvent>,
pub bond_status: BondStatus,
pub participation_rate: f64,
pub last_block_produced: u64,
}
impl SlashingDiagnostics {
pub async fn diagnose_slashing_risk() -> Vec<SlashingRisk> {
let mut risks = Vec::new();
// Check participation rate
if self.participation_rate < 95.0 {
risks.push(SlashingRisk::LowParticipation);
}
// Check if missing blocks
let current_height = get_current_height().await?;
if current_height - self.last_block_produced > 100 {
risks.push(SlashingRisk::MissedBlocks);
}
// Analyze past slashing events
for event in &self.slashing_events {
match event.reason {
SlashingReason::Equivocation => risks.push(SlashingRisk::EquivocationRisk),
SlashingReason::Downtime => risks.push(SlashingRisk::DowntimeRisk),
}
}
risks
}
}
Validator Recovery:
# Check bond status
curl -s http://localhost:30333/validator/bond | jq .
# Report slashing evidence
curl -X POST http://localhost:30333/validator/report-evidence \
-H "Content-Type: application/json" \
-d '{"evidence": "..."}'
# Withdraw bond (if necessary)
./savitri-node --withdraw-bond --amount 1000000
# Re-register as validator
./savitri-node --register-validator --bond 1000000
Archive Node Issues
Issue: Query performance problems
// query-diagnostics.rs
pub struct QueryDiagnostics {
pub average_query_time_ms: f64,
pub slow_queries: Vec<SlowQuery>,
pub cache_hit_rate: f64,
pub index_usage: HashMap<String, f64>,
}
impl QueryDiagnostics {
pub async fn diagnose_query_issues() -> Vec<QueryIssue> {
let mut issues = Vec::new();
// Check query performance
if self.average_query_time_ms > 1000.0 {
issues.push(QueryIssue::SlowQueries);
}
// Check cache efficiency
if self.cache_hit_rate < 80.0 {
issues.push(QueryIssue::LowCacheHitRate);
}
// Analyze slow queries
for query in &self.slow_queries {
if query.execution_time_ms > 5000.0 {
issues.push(QueryIssue::VerySlowQuery(query.clone()));
}
}
// Check index usage
for (index, usage) in &self.index_usage {
if *usage < 50.0 {
issues.push(QueryIssue::UnusedIndex(index.clone()));
}
}
issues
}
}
Archive Node Recovery:
# Rebuild query indexes
./savitri-node --rebuild-indexes
# Clear query cache
./savitri-node --clear-query-cache
# Optimize database
./savitri-node --optimize-database
# Compact storage
./savitri-node --compact-storage
RPC Node Issues
Issue: API rate limiting
// rpc-diagnostics.rs
pub struct RpcDiagnostics {
pub requests_per_second: f64,
pub rate_limited_requests: u64,
pub average_response_time_ms: f64,
pub active_connections: u64,
}
impl RpcDiagnostics {
pub async fn diagnose_rpc_issues() -> Vec<RpcIssue> {
let mut issues = Vec::new();
// Check request rate
if self.requests_per_second > 1000.0 {
issues.push(RpcIssue::HighRequestRate);
}
// Check rate limiting
if self.rate_limited_requests > 100 {
issues.push(RpcIssue::ExcessiveRateLimiting);
}
// Check response time
if self.average_response_time_ms > 500.0 {
issues.push(RpcIssue::SlowResponse);
}
// Check connection limits
if self.active_connections > 1000 {
issues.push(RpcIssue::ConnectionLimit);
}
issues
}
}
RPC Node Recovery:
# Reset rate limiter
curl -X POST http://localhost:30333/rpc/reset-rate-limiter
# Clear connection cache
curl -X POST http://localhost:30333/rpc/clear-cache
# Adjust rate limits
curl -X POST http://localhost:30333/rpc/update-config \
-H "Content-Type: application/json" \
-d '{"rate_limit": 2000, "connection_limit": 2000}'
Light Node Issues
Issue: Synchronization problems
// light-node-diagnostics.rs
pub struct LightNodeDiagnostics {
pub header_sync_height: u64,
pub trusted_peers: u64,
pub sync_status: LightSyncStatus,
pub battery_usage: f64,
pub data_usage_mb: u64,
}
impl LightNodeDiagnostics {
pub async fn diagnose_light_node_issues() -> Vec<LightNodeIssue> {
let mut issues = Vec::new();
// Check trusted peers
if self.trusted_peers < 3 {
issues.push(LightNodeIssue::InsufficientTrustedPeers);
}
// Check sync progress
match self.sync_status {
LightSyncStatus::Stalled => issues.push(LightNodeIssue::SyncStalled),
LightSyncStatus::Slow => issues.push(LightNodeIssue::SlowSync),
_ => {}
}
// Check battery usage (mobile)
if self.battery_usage > 20.0 {
issues.push(LightNodeIssue::HighBatteryUsage);
}
// Check data usage
if self.data_usage_mb > 1000 {
issues.push(LightNodeIssue::HighDataUsage);
}
issues
}
}
Light Node Recovery:
# Reset trusted peers
./savitri-light-node --reset-trusted-peers
# Add trusted bootstrap nodes
./savitri-light-node --add-trusted-peer "/ip4/1.2.3.4/tcp/30333"
# Clear header cache
./savitri-light-node --clear-header-cache
# Optimize for mobile
./savitri-light-node --mobile-mode --low-power
Network Issues
Connectivity Problems
Diagnose Network Issues:
#!/bin/bash
# network-diagnosis.sh
echo "=== Network Diagnosis ==="
# Check basic connectivity
echo "Checking internet connectivity..."
ping -c 3 8.8.8.8 > /dev/null 2>&1
if [ $? -eq 0 ]; then
echo "✓ Internet connectivity OK"
else
echo "✗ Internet connectivity failed"
fi
# Check DNS resolution
echo "Checking DNS resolution..."
nslookup bootstrap-node.savitri.network > /dev/null 2>&1
if [ $? -eq 0 ]; then
echo "✓ DNS resolution OK"
else
echo "✗ DNS resolution failed"
fi
# Check port availability
echo "Checking port 30333..."
netstat -tlnp | grep :30333 > /dev/null 2>&1
if [ $? -eq 0 ]; then
echo "✓ Port 30333 is listening"
else
echo "✗ Port 30333 is not listening"
fi
# Check firewall
echo "Checking firewall rules..."
sudo ufw status | grep 30333 > /dev/null 2>&1
if [ $? -eq 0 ]; then
echo "✓ Firewall allows port 30333"
else
echo "⚠️ Firewall may block port 30333"
fi
echo ""
Network Recovery:
# Restart networking
sudo systemctl restart networking
# Flush DNS cache
sudo systemd-resolve --flush-caches
# Reset firewall rules
sudo ufw --force reset
sudo ufw allow 30333
sudo ufw enable
# Restart node
sudo systemctl restart savitri-node
Peer Discovery Issues
Diagnose Peer Discovery:
// peer-discovery-diagnostics.rs
pub struct PeerDiscoveryDiagnostics {
pub bootstrap_nodes: Vec<String>,
pub discovered_peers: u64,
pub connected_peers: u64,
pub discovery_time_ms: f64,
}
impl PeerDiscoveryDiagnostics {
pub async fn diagnose_peer_discovery() -> Vec<PeerDiscoveryIssue> {
let mut issues = Vec::new();
// Check bootstrap nodes
if self.bootstrap_nodes.is_empty() {
issues.push(PeerDiscoveryIssue::NoBootstrapNodes);
}
// Check discovery success
if self.discovered_peers < 10 {
issues.push(PeerDiscoveryIssue::LowDiscoveryRate);
}
// Check connection success
let connection_rate = self.connected_peers as f64 / self.discovered_peers as f64;
if connection_rate < 0.5 {
issues.push(PeerDiscoveryIssue::LowConnectionRate);
}
// Check discovery speed
if self.discovery_time_ms > 30000.0 {
issues.push(PeerDiscoveryIssue::SlowDiscovery);
}
issues
}
}
Peer Discovery Recovery:
# Reset peer discovery
curl -X POST http://localhost:30333/peers/discovery/reset
# Add bootstrap nodes
curl -X POST http://localhost:30333/peers/bootstrap/add \
-H "Content-Type: application/json" \
-d '{"nodes": ["/ip4/1.2.3.4/tcp/30333"]}'
# Force peer discovery
curl -X POST http://localhost:30333/peers/discovery/force
# Clear peer cache
curl -X POST http://localhost:30333/peers/cache/clear
Storage Issues
Database Corruption
Diagnose Database Issues:
#!/bin/bash
# database-diagnosis.sh
DB_PATH="/var/lib/savitri/db"
echo "=== Database Diagnosis ==="
# Check database file integrity
echo "Checking database integrity..."
if [ -f "$DB_PATH/CURRENT" ]; then
echo "✓ Database files exist"
# Check file permissions
ls -la "$DB_PATH" | head -5
# Check disk space
DISK_USAGE=$(df "$DB_PATH" | tail -1 | awk '{print $5}' | sed 's/%//')
if [ $DISK_USAGE -gt 90 ]; then
echo "⚠️ High disk usage: $DISK_USAGE%"
else
echo "✓ Disk usage OK: $DISK_USAGE%"
fi
# Check for lock files
if [ -f "$DB_PATH/LOCK" ]; then
echo "⚠️ Database lock file exists"
else
echo "✓ No database lock"
fi
else
echo "✗ Database files missing"
fi
echo ""
Database Recovery:
# Create database backup
cp -r /var/lib/savitri/db /var/lib/savitri/db.backup.$(date +%s)
# Run database repair
./savitri-node --repair-database
# Rebuild from scratch (last resort)
./savitri-node --rebuild-database
# Verify database integrity
./savitri-node --verify-database
Storage Performance Issues
Diagnose Storage Performance:
// storage-diagnostics.rs
pub struct StorageDiagnostics {
pub read_latency_ms: f64,
pub write_latency_ms: f64,
pub read_throughput_mbps: f64,
pub write_throughput_mbps: f64,
pub iops: u64,
pub disk_usage_percent: f64,
}
impl StorageDiagnostics {
pub async fn diagnose_storage_issues() -> Vec<StorageIssue> {
let mut issues = Vec::new();
// Check read latency
if self.read_latency_ms > 10.0 {
issues.push(StorageIssue::HighReadLatency);
}
// Check write latency
if self.write_latency_ms > 50.0 {
issues.push(StorageIssue::HighWriteLatency);
}
// Check throughput
if self.read_throughput_mbps < 100.0 {
issues.push(StorageIssue::LowReadThroughput);
}
if self.write_throughput_mbps < 50.0 {
issues.push(StorageIssue::LowWriteThroughput);
}
// Check disk space
if self.disk_usage_percent > 90.0 {
issues.push(StorageIssue::LowDiskSpace);
}
issues
}
}
Storage Optimization:
# Optimize database
./savitri-node --optimize-database
# Compact storage
./savitri-node --compact-storage
# Clear old data
./savitri-node --prune-old-data --blocks-to-keep 100000
# Rebuild indexes
./savitri-node --rebuild-indexes
# Adjust cache size
./savitri-node --cache-size 2048
Performance Issues
CPU Performance
Diagnose CPU Issues:
#!/bin/bash
# cpu-diagnosis.sh
NODE_PID=$(pgrep savitri-node)
if [ -z "$NODE_PID" ]; then
echo "Savitri node not running"
exit 1
fi
echo "=== CPU Diagnosis ==="
# Check CPU usage
CPU_USAGE=$(ps -p $NODE_PID -o %cpu --no-headers)
echo "CPU Usage: $CPU_USAGE%"
# Check thread count
THREAD_COUNT=$(ps -p $NODE_PID -o nlwp --no-headers)
echo "Thread Count: $THREAD_COUNT"
# Check CPU affinity
taskset -pc $NODE_PID
# Profile CPU usage
perf top -p $NODE_PID
echo ""
CPU Optimization:
// cpu-config.rs
pub struct CpuConfig {
pub worker_threads: usize,
pub io_threads: usize,
pub enable_cpu_profiling: bool,
pub cpu_affinity: Option<Vec<usize>>,
}
impl Default for CpuConfig {
fn default() -> Self {
Self {
worker_threads: num_cpus::get(),
io_threads: num_cpus::get() / 2,
enable_cpu_profiling: false,
cpu_affinity: None,
}
}
}
Memory Performance
Diagnose Memory Issues:
#!/bin/bash
# memory-diagnosis.sh
NODE_PID=$(pgrep savitri-node)
if [ -z "$NODE_PID" ]; then
echo "Savitri node not running"
exit 1
fi
echo "=== Memory Diagnosis ==="
# Check memory usage
MEMORY_USAGE=$(ps -p $NODE_PID -o %mem --no-headers)
echo "Memory Usage: $MEMORY_USAGE%"
# Check resident set size
RSS=$(ps -p $NODE_PID -o rss --no-headers)
echo "RSS: $RSS KB"
# Check virtual memory size
VSZ=$(ps -p $NODE_PID -o vsz --no-headers)
echo "VSZ: $VSZ KB"
# Check for memory leaks
valgrind --tool=massif --pages-as-heap=yes ./savitri-node
echo ""
Memory Optimization:
// memory-optimizer.rs
pub struct MemoryOptimizer {
pub gc_interval_seconds: u64,
pub max_heap_size_mb: u64,
pub enable_memory_pooling: bool,
pub pool_size_mb: u64,
}
impl MemoryOptimizer {
pub fn optimize_memory_usage(&self) {
// Enable memory pooling
if self.enable_memory_pooling {
self.enable_memory_pool();
}
// Set up periodic garbage collection
self.setup_gc_scheduler();
// Monitor memory usage
self.start_memory_monitoring();
}
}
Security Issues
Authentication Problems
Diagnose Authentication Issues:
// auth-diagnostics.rs
pub struct AuthDiagnostics {
pub failed_logins: u64,
pub active_sessions: u64,
pub last_login_attempt: SystemTime,
pub security_events: Vec<SecurityEvent>,
}
impl AuthDiagnostics {
pub async fn diagnose_auth_issues() -> Vec<AuthIssue> {
let mut issues = Vec::new();
// Check for brute force attacks
if self.failed_logins > 100 {
issues.push(AuthIssue::BruteForceAttack);
}
// Check session management
if self.active_sessions > 1000 {
issues.push(AuthIssue::SessionOverflow);
}
// Analyze security events
for event in &self.security_events {
match event.event_type {
SecurityEventType::SuspiciousLogin => issues.push(AuthIssue::SuspiciousActivity),
SecurityEventType::PrivilegeEscalation => issues.push(AuthIssue::SecurityBreach),
}
}
issues
}
}
Security Recovery:
# Reset authentication
curl -X POST http://localhost:30333/auth/reset
# Block suspicious IPs
curl -X POST http://localhost:30333/auth/block-ip \
-H "Content-Type: application/json" \
-d '{"ip": "1.2.3.4", "duration": 3600}'
# Clear sessions
curl -X POST http://localhost:30333/auth/clear-sessions
# Enable additional security
curl -X POST http://localhost:30333/auth/enable-2fa
Network Security
Diagnose Security Issues:
#!/bin/bash
# security-diagnosis.sh
echo "=== Security Diagnosis ==="
# Check for suspicious connections
echo "Checking active connections..."
netstat -an | grep :30333 | grep ESTABLISHED | wc -l
# Check for failed authentication attempts
echo "Checking auth logs..."
grep "authentication failed" /var/log/savitri/auth.log | tail -10
# Check firewall status
echo "Checking firewall..."
sudo ufw status
# Check for unusual processes
echo "Checking processes..."
ps aux | grep savitri | grep -v grep
echo ""
Recovery Procedures
Emergency Recovery
Complete Node Recovery:
#!/bin/bash
# emergency-recovery.sh
echo "=== Emergency Recovery Procedure ==="
# Stop the node
sudo systemctl stop savitri-node
# Create backup
BACKUP_DIR="/var/lib/savitri/backup/$(date +%s)"
mkdir -p "$BACKUP_DIR"
cp -r /var/lib/savitri/* "$BACKUP_DIR/"
# Clear corrupted data
rm -rf /var/lib/savitri/db/*
rm -rf /var/lib/savitri/cache/*
# Reset configuration
cp /etc/savitri/node.toml.default /etc/savitri/node.toml
# Restart with clean state
sudo systemctl start savitri-node
# Monitor recovery
watch -n 5 'curl -s http://localhost:30333/health | jq .'
echo ""
Partial Recovery:
# Reset only specific components
./savitri-node --reset-mempool
./savitri-node --reset-consensus-state
./savitri-node --reset-peer-table
# Recover from specific height
./savitri-node --recover-from-height 1000000
# Import trusted state
./savitri-node --import-state trusted-state.json
Data Recovery
Database Recovery:
# Restore from backup
./savitri-node --restore-from-backup /path/to/backup
# Repair corrupted database
./savitri-node --repair-database --force
# Rebuild missing indexes
./savitri-node --rebuild-missing-indexes
# Verify data integrity
./savitri-node --verify-data-integrity
Preventive Maintenance
Regular Maintenance Tasks
Maintenance Script:
#!/bin/bash
# maintenance.sh
echo "=== Preventive Maintenance ==="
# Clean old logs
find /var/log/savitri -name "*.log" -mtime +7 -delete
# Compact database
./savitri-node --compact-storage
# Clear cache
./savitri-node --clear-cache
# Update peer list
curl -X POST http://localhost:30333/peers/refresh
# Check disk space
DISK_USAGE=$(df /var/lib/savitri | tail -1 | awk '{print $5}' | sed 's/%//')
if [ $DISK_USAGE -gt 80 ]; then
echo "Warning: High disk usage ($DISK_USAGE%)"
./savitri-node --prune-old-data
fi
# Backup configuration
cp /etc/savitri/node.toml /etc/savitri/node.toml.backup.$(date +%s)
echo "Maintenance completed"
echo ""
Health Monitoring
Health Check Automation:
// health-monitor.rs
pub struct HealthMonitor {
pub check_interval_seconds: u64,
pub alert_thresholds: AlertThresholds,
pub notification_channels: Vec<NotificationChannel>,
}
impl HealthMonitor {
pub async fn start_monitoring(&self) {
let mut interval = tokio::time::interval(
Duration::from_secs(self.check_interval_seconds)
);
loop {
interval.tick().await;
let health = self.check_node_health().await;
if !health.is_healthy() {
self.send_alert(&health).await;
}
}
}
async fn check_node_health(&self) -> NodeHealth {
NodeHealth {
cpu_usage: self.get_cpu_usage().await,
memory_usage: self.get_memory_usage().await,
disk_usage: self.get_disk_usage().await,
network_status: self.get_network_status().await,
blockchain_status: self.get_blockchain_status().await,
}
}
}
Performance Tuning
Performance Optimization:
// performance-tuner.rs
pub struct PerformanceTuner {
pub target_cpu_usage: f64,
pub target_memory_usage: f64,
pub target_response_time_ms: f64,
}
impl PerformanceTuner {
pub async fn auto_tune(&mut self) {
let current_metrics = self.collect_metrics().await;
// Adjust worker threads based on CPU usage
if current_metrics.cpu_usage > self.target_cpu_usage {
self.reduce_worker_threads();
} else if current_metrics.cpu_usage < self.target_cpu_usage * 0.5 {
self.increase_worker_threads();
}
// Adjust cache size based on memory usage
if current_metrics.memory_usage > self.target_memory_usage {
self.reduce_cache_size();
} else if current_metrics.memory_usage < self.target_memory_usage * 0.7 {
self.increase_cache_size();
}
// Adjust network parameters
if current_metrics.average_response_time > self.target_response_time_ms {
self.optimize_network_settings();
}
}
}
This comprehensive troubleshooting guide provides systematic approaches to diagnosing and resolving issues across all Savitri Network node types, ensuring reliable operation and quick recovery from problems.