diff --git a/package-lock.json b/package-lock.json index 359787b6e..0e7f9488a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@shardeum-foundation/core", - "version": "2.16.0-prerelease.1", + "version": "2.16.0-prerelease.4", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@shardeum-foundation/core", - "version": "2.16.0-prerelease.1", + "version": "2.16.0-prerelease.4", "dependencies": { "@hapi/sntp": "3.1.1", "@mapbox/node-pre-gyp": "1.0.10", diff --git a/package.json b/package.json index ca95436a8..40adbab86 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@shardeum-foundation/core", - "version": "2.16.0-prerelease.1", + "version": "2.16.0-prerelease.4", "engines": { "node": "20.19.3" }, diff --git a/src/config/server.ts b/src/config/server.ts index 2739d40cb..1dba271be 100644 --- a/src/config/server.ts +++ b/src/config/server.ts @@ -71,7 +71,6 @@ const SERVER_CONFIG: StrictServerConfiguration = { maxRotatedPerCycle: 1, flexibleRotationDelta: 1, flexibleRotationEnabled: false, - enableDangerousProblematicNodeRemoval: false, enableProblematicNodeRemoval: false, enableProblematicNodeRemovalOnCycle: 20000, maxProblematicNodeRemovalsPerCycle: 1, @@ -79,6 +78,7 @@ const SERVER_CONFIG: StrictServerConfiguration = { problematicNodeRefutePercentageThreshold: 0.1, problematicNodeHistoryLength: 60, problematicNodeRemovalCycleFrequency: 5, + problematicNodeRemovalSafetyDelta: 1, // How far below minNodes before we stop removing problematic nodes // New flags for problematic node cache v2 useProblematicNodeCacheV2: false, // When true, use the new cache-based implementation enableProblematicNodeCacheBuilding: false, // Enable shadow mode cache building for validation diff --git a/src/debug/debug.ts b/src/debug/debug.ts index bf14cc0f2..11e42ba3b 100644 --- a/src/debug/debug.ts +++ b/src/debug/debug.ts @@ -198,7 +198,7 @@ class Debug { id: nodeId.substring(0, 8), fullId: nodeId, refuteCycles: node.refuteCycles || [], - consecutiveRefutes: ProblemNodeHandler.getConsecutiveRefutes( + consecutiveRefutes: ProblemNodeHandler.getMaxConsecutiveRefutes( node.refuteCycles || [], currentCycleRecord.counter ), @@ -231,11 +231,11 @@ class Debug { this.network.registerExternalGet('debug_simulateProblematic', isDebugModeMiddleware, (req, res) => { try { // Parse parameters with defaults - const missConsensus = req.query.missConsensus ? parseFloat(req.query.missConsensus as string) : 0 - const networkDelay = req.query.networkDelay ? parseInt(req.query.networkDelay as string) : 0 - const dropMessages = req.query.dropMessages ? parseFloat(req.query.dropMessages as string) : 0 - const slowResponse = req.query.slowResponse ? parseFloat(req.query.slowResponse as string) : 0 - const slowDelayMs = req.query.slowDelayMs ? parseInt(req.query.slowDelayMs as string) : 0 + const missConsensus = req.query.missConsensus ? parseFloat(req.query.missConsensus as string) : 1 + const networkDelay = req.query.networkDelay ? parseInt(req.query.networkDelay as string) : 100 + const dropMessages = req.query.dropMessages ? parseFloat(req.query.dropMessages as string) : 1 + const slowResponse = req.query.slowResponse ? parseFloat(req.query.slowResponse as string) : 1 + const slowDelayMs = req.query.slowDelayMs ? parseInt(req.query.slowDelayMs as string) : 10000 const reset = req.query.reset === 'true' // Validate parameters diff --git a/src/p2p/ModeSystemFuncs.ts b/src/p2p/ModeSystemFuncs.ts index 6823167ae..db21d2049 100644 --- a/src/p2p/ModeSystemFuncs.ts +++ b/src/p2p/ModeSystemFuncs.ts @@ -637,10 +637,6 @@ export function getExpiredRemovedV3( // filter out apoptosized nodes from the problematic nodes const problematicNodes = problematicWithApoptosizedNodes.filter((id) => !apoptosizedNodesList.includes(id)) const canRemoveProblematicNodesThisCycle = prevRecord.counter % config.p2p.problematicNodeRemovalCycleFrequency === 0 - const numProblematicRemovals = Math.min( - problematicNodes.length, - canRemoveProblematicNodesThisCycle ? config.p2p.maxProblematicNodeRemovalsPerCycle || 1 : 0 - ) // we can remove `remove` nodes, but we *must* remove the number of apoptosized nodes, // the remainder is the number of expired nodes we can remove this cycle @@ -648,6 +644,24 @@ export function getExpiredRemovedV3( const numPotentiallyExpiredRemovals = Math.max(remove - numApoptosizedRemovals, 0) const numExpiredRemovals = Math.min(numPotentiallyExpiredRemovals, numExpiredNodes) + // Calculate the safety threshold for problematic node removal + const safetyThreshold = config.p2p.minNodes - config.p2p.problematicNodeRemovalSafetyDelta + const activeAfterOtherRemovals = active - numApoptosizedRemovals - numExpiredRemovals + add + + // Check if we're below the safety threshold or would go below it + let numProblematicRemovals = 0 + let problematicRemovalPrevented = false + if (activeAfterOtherRemovals > safetyThreshold && canRemoveProblematicNodesThisCycle) { + // Calculate how many problematic nodes we can safely remove without going below threshold + const maxSafeProblematicRemovals = Math.max(0, activeAfterOtherRemovals - safetyThreshold) + const maxConfiguredRemovals = config.p2p.maxProblematicNodeRemovalsPerCycle + + numProblematicRemovals = Math.min(problematicNodes.length, maxConfiguredRemovals, maxSafeProblematicRemovals) + } else if (canRemoveProblematicNodesThisCycle && problematicNodes.length > 0) { + // We would have removed problematic nodes but safety threshold prevents it + problematicRemovalPrevented = true + } + const cycle = CycleChain.newest.counter if (cycle > lastLoggedCycle && remove > 0) { @@ -675,14 +689,23 @@ export function getExpiredRemovedV3( canRemoveProblematicNodesThisCycle: canRemoveProblematicNodesThisCycle, numPotentiallyExpiredRemovals: numPotentiallyExpiredRemovals, numExpiredRemovals: numExpiredRemovals, + activeAfterOtherRemovals: activeAfterOtherRemovals, + safetyThreshold: safetyThreshold, + problematicRemovalPrevented: problematicRemovalPrevented, } nestedCountersInstance.countEvent('p2p', `results of getExpiredRemovedV3: ${JSON.stringify(counters)}`) + + if (problematicRemovalPrevented) { + nestedCountersInstance.countEvent( + 'p2p', + `problematic node removal prevented: active after removals (${activeAfterOtherRemovals}) <= safety threshold (${safetyThreshold})` + ) + } } - const dangerousRemoval = remove === 0 && config.p2p.enableDangerousProblematicNodeRemoval === false const nodesNeverExpire = config.p2p.nodeExpiryAge < 0 - // if its recommended that we dont remove any nodes, Or the nodes never expire, we MUST adhere to this. - if (dangerousRemoval || nodesNeverExpire) { + // if nodes never expire, we MUST adhere to this. + if (nodesNeverExpire) { return { problematic: 0, expired: 0, diff --git a/src/p2p/ProblemNodeHandler.ts b/src/p2p/ProblemNodeHandler.ts index c96a88725..b0128ac7f 100644 --- a/src/p2p/ProblemNodeHandler.ts +++ b/src/p2p/ProblemNodeHandler.ts @@ -82,7 +82,7 @@ export function exportProblematicNodeCache(): string | null { } try { - return problematicNodeCache.toCompressedJSON() + return problematicNodeCache.toJSON() } catch (err) { error('Failed to export ProblematicNodeCache:', err) return null @@ -95,6 +95,52 @@ export function getProblematicNodes(prevRecord: P2P.CycleCreatorTypes.CycleRecor const activeNodeIds = new Set(NodeList.activeByIdOrder.map((node) => node.id)) return problematicNodeCache.getProblematicNodes(prevRecord.counter, activeNodeIds) } + // Return empty array when feature is disabled + return [] +} + +// Get problematic node info for reporting to monitor - info about self node only +export function getProblematicNodeInfoForSelf(nodeId: string): any | null { + if (!problematicNodeCache || !config.p2p.enableProblematicNodeCacheBuilding) { + return null + } + + try { + const currentCycle = CycleChain.newest?.counter || 0 + + const metrics = problematicNodeCache.calculateNodeMetrics(nodeId, currentCycle) + const refuteHistory = problematicNodeCache.refuteHistory.get(nodeId) || [] + + // Get the cache's cycle range to determine what cycles we have data for + const cacheInfo = problematicNodeCache.getCycleCoverage() + let startCycle = cacheInfo.cycleRange ? cacheInfo.cycleRange.min : currentCycle + let endCycle = cacheInfo.cycleRange ? cacheInfo.cycleRange.max : currentCycle + + // Build cycle history for all cycles in the cache + const cycleRefuteHistory: boolean[] = [] + for (let cycle = startCycle; cycle <= endCycle; cycle++) { + cycleRefuteHistory.push(refuteHistory.includes(cycle)) + } + + // Calculate total refutes (all refutes in the cache) + const totalRefutes = refuteHistory.length + + const problematicNodeInfo = { + isProblematic: + metrics.consecutiveRefutes >= config.p2p.problematicNodeConsecutiveRefuteThreshold || + metrics.refutePercentage >= config.p2p.problematicNodeRefutePercentageThreshold, + totalRefutes: totalRefutes, + maxConsecutiveRefutes: metrics.consecutiveRefutes, + refutePercentage: metrics.refutePercentage, + cycleRefuteHistory: cycleRefuteHistory, + newestCycle: endCycle, // The newest cycle in cache + } + + return problematicNodeInfo + } catch (err) { + error('Failed to get problematic node info for self:', err) + return null + } } export function getRefutePercentage(refuteCycles: number[], currentCycle: number): number { @@ -104,9 +150,9 @@ export function getRefutePercentage(refuteCycles: number[], currentCycle: number return 0 } -export function getConsecutiveRefutes(refuteCycles: number[], currentCycle: number): number { +export function getMaxConsecutiveRefutes(refuteCycles: number[], currentCycle: number): number { if (config.p2p.useProblematicNodeCacheV2 && problematicNodeCache) { - return problematicNodeCache.getConsecutiveRefutes(refuteCycles, currentCycle) + return problematicNodeCache.getMaxConsecutiveRefutes(refuteCycles, currentCycle) } return 0 } diff --git a/src/p2p/ProblematicNodeCache.ts b/src/p2p/ProblematicNodeCache.ts index 4ffc64049..46471b8dd 100644 --- a/src/p2p/ProblematicNodeCache.ts +++ b/src/p2p/ProblematicNodeCache.ts @@ -196,7 +196,7 @@ export class ProblematicNodeCache { const refuteCycles = this.refuteHistory.get(nodeId) || [] // Calculate consecutive refutes - const consecutiveRefutes = this.getConsecutiveRefutes(refuteCycles, currentCycle) + const consecutiveRefutes = this.getMaxConsecutiveRefutes(refuteCycles, currentCycle) // Calculate refute percentage const refutePercentage = this.getRefutePercentage(refuteCycles, currentCycle) @@ -213,40 +213,38 @@ export class ProblematicNodeCache { return metrics } - getConsecutiveRefutes(refuteCycles: number[], currentCycle: number): number { + getMaxConsecutiveRefutes(refuteCycles: number[], currentCycle: number): number { if (refuteCycles.length === 0) return 0 // Filter to only include refutes up to current cycle const relevantRefutes = refuteCycles.filter((cycle) => cycle <= currentCycle) if (relevantRefutes.length === 0) return 0 - // Count consecutive refutes ending at current cycle or one before - let count = 0 + // Find the maximum consecutive refutes in the retained cycle history const sortedRefutes = [...relevantRefutes].sort((a, b) => a - b) - // Start from the end and count backwards - for (let i = sortedRefutes.length - 1; i >= 0; i--) { - const cycle = sortedRefutes[i] + let maxConsecutive = 1 // At least 1 if we have any refutes + let currentConsecutive = 1 - if (i === sortedRefutes.length - 1) { - // Last refute must be at current cycle or one before - if (cycle === currentCycle || cycle === currentCycle - 1) { - count = 1 - } else { - break - } + // Iterate through sorted refutes to find all consecutive sequences + for (let i = 1; i < sortedRefutes.length; i++) { + const currentCycle = sortedRefutes[i] + const prevCycle = sortedRefutes[i - 1] + + if (currentCycle === prevCycle + 1) { + // Consecutive with previous cycle + currentConsecutive++ } else { - // Check if consecutive with next cycle - const nextCycle = sortedRefutes[i + 1] - if (cycle === nextCycle - 1) { - count++ - } else { - break - } + // Gap found, update max and reset current count + maxConsecutive = Math.max(maxConsecutive, currentConsecutive) + currentConsecutive = 1 } } - return count + // Update max with final sequence + maxConsecutive = Math.max(maxConsecutive, currentConsecutive) + + return maxConsecutive } getRefutePercentage(refuteCycles: number[], currentCycle: number): number { @@ -274,7 +272,7 @@ export class ProblematicNodeCache { } // Sort by score and return top N based on maxProblematicNodeRemovalsPerCycle - const maxRemovals = this.config.p2p.maxProblematicNodeRemovalsPerCycle || 1 + const maxRemovals = this.config.p2p.maxProblematicNodeRemovalsPerCycle return problematicNodes .sort((a, b) => b.score - a.score) diff --git a/src/reporter/index.ts b/src/reporter/index.ts index 4c738d465..d6d3be8da 100644 --- a/src/reporter/index.ts +++ b/src/reporter/index.ts @@ -25,6 +25,7 @@ import { finishedSyncingCycle } from '../p2p/Join' import { currentCycle } from '../p2p/CycleCreator' import process from 'process' import { fireAndForget } from '../utils/functions/promises' +import * as ProblemNodeHandler from '../p2p/ProblemNodeHandler' const http = require('../http') const allZeroes64 = '0'.repeat(64) @@ -350,6 +351,9 @@ class Reporter { const archiverListHash = Archivers.getArchiverListHash() const lastInSyncResult = this.stateManager.accountPatcher.lastInSyncResult + // Get problematic node info for this validator + const problematicNodeInfo = ProblemNodeHandler.getProblematicNodeInfoForSelf(Self.id) + try { await this._sendReport({ repairsStarted, @@ -397,6 +401,7 @@ class Reporter { cycleFinishedSyncing, stillNeedsInitialPatchPostActive, memory: process.memoryUsage(), + problematicNodeInfo, }) if (this.stateManager != null && config.mode === 'debug' && !config.debug.disableTxCoverageReport) { this.stateManager.transactionQueue.resetTxCoverageMap() diff --git a/src/shardus/shardus-types.ts b/src/shardus/shardus-types.ts index a1711b797..65927d988 100644 --- a/src/shardus/shardus-types.ts +++ b/src/shardus/shardus-types.ts @@ -774,13 +774,13 @@ export interface ServerConfiguration { /** Problematic Node configurations */ /** enable problematic node removal */ enableProblematicNodeRemoval?: boolean - /** when true, we will remove problematic nodes even when calculateToAcceptV2 says we should not remove any nodes. This is useful in development when testing this feature. */ - enableDangerousProblematicNodeRemoval?: boolean /** enable problematic node removal on a specific cycle. This is to allow the network to stabilize before removing problematic nodes. * enableProblematicNodeRemoval must be true for this to take effect*/ enableProblematicNodeRemovalOnCycle?: number /** The problematicNodeRemovalCycleFrequency parameter is an Integer specifying the number of cycles between problematic node removals. */ problematicNodeRemovalCycleFrequency?: number + /** The problematicNodeRemovalSafetyDelta parameter is an Integer specifying how far below minNodes the active count can be before we stop removing problematic nodes. For example, if minNodes is 100 and this value is 10, we won't remove problematic nodes if active count < 90. */ + problematicNodeRemovalSafetyDelta?: number /** The maxProblematicNodeRemovalsPerCycle parameter is an Integer specifying the maximum number of problematic nodes that can be removed from the network each cycle. */ maxProblematicNodeRemovalsPerCycle?: number /** The problematicNodeConsecutiveRefuteThreshold parameter is an Integer specifying the number of consecutive refutes a node must have before it is considered problematic. */ diff --git a/test/unit/debug/index.test.ts b/test/unit/debug/index.test.ts index 7b1366c62..7503bb922 100644 --- a/test/unit/debug/index.test.ts +++ b/test/unit/debug/index.test.ts @@ -62,7 +62,7 @@ jest.mock('../../../src/p2p/CycleChain', () => ({ jest.mock('../../../src/p2p/ProblemNodeHandler', () => ({ getProblematicNodes: jest.fn().mockReturnValue([]), getRefutePercentage: jest.fn().mockReturnValue(0), - getConsecutiveRefutes: jest.fn().mockReturnValue(0), + getMaxConsecutiveRefutes: jest.fn().mockReturnValue(0), isNodeProblematic: jest.fn().mockReturnValue(false), exportProblematicNodeCache: jest.fn().mockReturnValue(null), })) diff --git a/test/unit/src/debug/debug.test.ts b/test/unit/src/debug/debug.test.ts index 52bc05ad9..93232c1f7 100644 --- a/test/unit/src/debug/debug.test.ts +++ b/test/unit/src/debug/debug.test.ts @@ -39,7 +39,7 @@ jest.mock('../../../../src/p2p/Context', () => ({ })) jest.mock('../../../../src/p2p/ProblemNodeHandler', () => ({ getRefutePercentage: jest.fn(), - getConsecutiveRefutes: jest.fn(), + getMaxConsecutiveRefutes: jest.fn(), isNodeProblematic: jest.fn(), })) jest.mock('../../../../src/utils/nestedCounters') diff --git a/test/unit/src/p2p/ModeSystemFuncs.test.ts b/test/unit/src/p2p/ModeSystemFuncs.test.ts index 7215b6d08..4aaea2d58 100644 --- a/test/unit/src/p2p/ModeSystemFuncs.test.ts +++ b/test/unit/src/p2p/ModeSystemFuncs.test.ts @@ -36,9 +36,9 @@ jest.mock('../../../../src/p2p/Context', () => ({ rotationCountMultiply: 1, rotationCountAdd: 0, rotationPercentActive: 0.001, - enableDangerousProblematicNodeRemoval: false, problematicNodeRemovalCycleFrequency: 10, maxProblematicNodeRemovalsPerCycle: 1, + problematicNodeRemovalSafetyDelta: 1, }, debug: { verboseNestedCounters: false, @@ -174,9 +174,9 @@ describe('ModeSystemFuncs', () => { rotationCountMultiply: 1, rotationCountAdd: 0, rotationPercentActive: 0.001, - enableDangerousProblematicNodeRemoval: false, problematicNodeRemovalCycleFrequency: 10, maxProblematicNodeRemovalsPerCycle: 1, + problematicNodeRemovalSafetyDelta: 1, } // Reset mock functions @@ -1083,33 +1083,6 @@ describe('ModeSystemFuncs', () => { ;(getProblematicNodes as jest.Mock).mockReturnValue([]) }) - it('should return empty result when dangerous removal is prevented', () => { - ;(config.p2p.enableDangerousProblematicNodeRemoval as any) = false - ;(config.p2p.syncFloorEnabled as any) = false // Use old logic - - const prevRecord = { - start: Date.now(), - desired: 20, - counter: 1, - mode: 'processing', - lost: [], - } as any - - const txs = { apoptosis: [] } as any - - mockNodeList.activeByIdOrder = new Array(20) - mockNodeList.byJoinOrder = new Array(20) - mockTargetCount = 20 // No removals - - const result = ModeSystemFuncs.getExpiredRemovedV3(prevRecord, lastLoggedCycle, txs, mockInfo) - - expect(result).toEqual({ - problematic: 0, - expired: 0, - removed: [], - }) - }) - it('should return empty result when nodes never expire', () => { ;(config.p2p.nodeExpiryAge as any) = -1 @@ -1175,6 +1148,8 @@ describe('ModeSystemFuncs', () => { ;(config.p2p.problematicNodeRemovalCycleFrequency as any) = 10 ;(config.p2p.maxProblematicNodeRemovalsPerCycle as any) = 2 ;(config.p2p.syncFloorEnabled as any) = false // Use old logic + ;(config.p2p.minNodes as any) = 10 + ;(config.p2p.problematicNodeRemovalSafetyDelta as any) = 1 const prevRecord = { start: now, @@ -1184,6 +1159,16 @@ describe('ModeSystemFuncs', () => { lost: [], } as any + // Create enough nodes to be above safety threshold + const nodes = [] + for (let i = 0; i < 15; i++) { + nodes.push({ + id: `node${i}`, + status: 'active', + activeTimestamp: now - 100000, // Not expired + }) + } + const problematicNode1 = { id: 'prob1', status: 'active', @@ -1195,9 +1180,11 @@ describe('ModeSystemFuncs', () => { activeTimestamp: now - 100000, // Not expired } - mockNodeList.byJoinOrder = [problematicNode1, problematicNode2] - mockNodeList.activeByIdOrder = [problematicNode1, problematicNode2] - mockTargetCount = 0 // Remove 2 nodes + // Add problematic nodes to the node list + const allNodes = [...nodes, problematicNode1, problematicNode2] + mockNodeList.byJoinOrder = allNodes + mockNodeList.activeByIdOrder = allNodes + mockTargetCount = 15 // Remove 2 nodes (17 active - 15 target = 2 to remove) ;(getProblematicNodes as jest.Mock).mockReturnValue(['prob1', 'prob2']) const txs = { apoptosis: [] } as any @@ -1284,6 +1271,9 @@ describe('ModeSystemFuncs', () => { ;(config.p2p.nodeExpiryAge as any) = 300000 ;(config.p2p.problematicNodeRemovalCycleFrequency as any) = 10 ;(config.p2p.syncFloorEnabled as any) = false // Use old logic + ;(config.p2p.minNodes as any) = 10 + ;(config.p2p.problematicNodeRemovalSafetyDelta as any) = 1 + ;(config.p2p.maxProblematicNodeRemovalsPerCycle as any) = 2 const prevRecord = { start: now, @@ -1293,6 +1283,16 @@ describe('ModeSystemFuncs', () => { lost: [], } as any + // Create enough nodes to be above safety threshold + const nodes = [] + for (let i = 0; i < 15; i++) { + nodes.push({ + id: `node${i}`, + status: 'active', + activeTimestamp: now - 100000, // Not expired + }) + } + const problematicNode = { id: 'prob1', status: 'active', @@ -1304,13 +1304,14 @@ describe('ModeSystemFuncs', () => { activeTimestamp: now - 100000, } - mockNodeList.byJoinOrder = [problematicNode, apoptosisProblematicNode] - mockNodeList.activeByIdOrder = [problematicNode, apoptosisProblematicNode] + const allNodes = [...nodes, problematicNode, apoptosisProblematicNode] + mockNodeList.byJoinOrder = allNodes + mockNodeList.activeByIdOrder = allNodes mockNodeList.nodes = new Map([ ['prob1', problematicNode], ['apopprob1', apoptosisProblematicNode], ]) - mockTargetCount = 0 // Remove 2 nodes + mockTargetCount = 16 // Remove 1 node (17 active - 16 target = 1 to remove) // Both nodes are problematic, but one is also apoptosis ;(getProblematicNodes as jest.Mock).mockReturnValue(['prob1', 'apopprob1']) diff --git a/test/unit/src/p2p/ProblematicNodeCache.test.ts b/test/unit/src/p2p/ProblematicNodeCache.test.ts index e64e8a2d4..b56cba967 100644 --- a/test/unit/src/p2p/ProblematicNodeCache.test.ts +++ b/test/unit/src/p2p/ProblematicNodeCache.test.ts @@ -539,10 +539,10 @@ describe('ProblematicNodeCache', () => { expect(metrics.consecutiveRefutes).toBe(6) // cycles 95-100 }) - test('should reset consecutive count after gap', () => { - // node2 is refuted in 96-98, gap at 99, no refute at 100 + test('should find maximum consecutive refutes in history', () => { + // node2 is refuted in 96-98, gap at 99-100, max consecutive is 3 const metrics = cache.calculateNodeMetrics('node2', 100) - expect(metrics.consecutiveRefutes).toBe(0) // Gap breaks the consecutive count + expect(metrics.consecutiveRefutes).toBe(3) // Maximum consecutive refutes in history }) test('should handle consecutive refutes at history boundary', () => {