diff --git a/op-node/rollup/derive/batches.go b/op-node/rollup/derive/batches.go index 861a8be953cf4..c826cac378e77 100644 --- a/op-node/rollup/derive/batches.go +++ b/op-node/rollup/derive/batches.go @@ -220,6 +220,8 @@ func checkSpanBatchPrefix(ctx context.Context, cfg *rollup.Config, log log.Logge nextTimestamp := l2SafeHead.Time + cfg.BlockTime + log.Info("checkSpanBatchPrefix", "nextTimestamp", nextTimestamp, "safe num", l2SafeHead.Number, "count", len(batch.Batches), "first", batch.Batches[0].Timestamp, "last", batch.Batches[len(batch.Batches)-1].Timestamp) + if batch.GetTimestamp() > nextTimestamp { if cfg.IsHolocene(l1InclusionBlock.Time) { log.Warn("dropping future span batch", "next_timestamp", nextTimestamp) diff --git a/test/3-op-init.sh b/test/3-op-init.sh index b032d3b5a5a2f..1bb06d9812f98 100755 --- a/test/3-op-init.sh +++ b/test/3-op-init.sh @@ -29,6 +29,10 @@ sed_inplace 's/"number": 0/"number": '"$NEXT_BLOCK_NUMBER"'/' ./config-op/rollup cp ./config-op/genesis.json ./config-op/genesis-reth.json sed_inplace 's/"number": "0x0"/"number": "'"$NEXT_BLOCK_NUMBER_HEX"'"/' ./config-op/genesis-reth.json +CURRENT_VALUE=$(jq -r '.genesis.l2_time' ./config-op/rollup.json) +NEW_VALUE=$((CURRENT_VALUE - 200)) +sed_inplace "s/\"l2_time\": $CURRENT_VALUE/\"l2_time\": $NEW_VALUE/" ./config-op/rollup.json + # Extract contract addresses from state.json and update .env file echo "🔧 Extracting contract addresses from state.json..." PWD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -165,6 +169,7 @@ OP_GETH_RPC_DATADIR="$(pwd)/data/op-geth-rpc" echo " 🔄 Copying database from op-geth-seq to op-geth-rpc..." rm -rf "$OP_GETH_RPC_DATADIR" cp -r "$OP_GETH_DATADIR" "$OP_GETH_RPC_DATADIR" +cp -r "$OP_GETH_DATADIR" "$OP_GETH_RPC_DATADIR"-bak if [ "$CONDUCTOR_ENABLED" = "true" ]; then if [ "$SEQ_TYPE" = "geth" ]; then @@ -189,6 +194,8 @@ fi echo "✅ Finished init op-$SEQ_TYPE-seq and op-$RPC_TYPE-rpc." +exit 0 + # genesis.json is too large to embed in go, so we compress it now and decompress it in go code gzip -c config-op/genesis.json > config-op/genesis.json.gz diff --git a/test/4-op-start-service.sh b/test/4-op-start-service.sh index 84c867da6888e..95b841813eb16 100755 --- a/test/4-op-start-service.sh +++ b/test/4-op-start-service.sh @@ -1,6 +1,5 @@ #!/bin/bash set -e -set -x # Load environment variables early source .env @@ -73,7 +72,8 @@ sleep 5 #$SCRIPTS_DIR/add-peers.sh if [ "$LAUNCH_RPC_NODE" = "true" ]; then - docker compose up -d op-rpc + echo "" + #docker compose up -d op-rpc fi # Configure op-batcher endpoints based on conductor mode @@ -86,13 +86,103 @@ if [ "$CONDUCTOR_ENABLED" = "true" ]; then else echo "🔧 Configuring op-batcher for single sequencer mode..." # Set single sequencer mode endpoints - export OP_BATCHER_L2_ETH_RPC="http://op-${SEQ_TYPE}-seq:8545" - export OP_BATCHER_ROLLUP_RPC="http://op-seq:9545" - echo "✅ op-batcher configured for single sequencer mode" +# export OP_BATCHER_L2_ETH_RPC="http://op-${SEQ_TYPE}-seq:8545" +# export OP_BATCHER_ROLLUP_RPC="http://op-seq:9545" +# echo "✅ op-batcher configured for single sequencer mode" fi +INIT_HEIGHT=8593921 +EXPECTED_WAIT_TIME=200 +TARGET_SAFE_HEIGHT=$INIT_HEIGHT +START_TIME=$(date +%s) +echo "⏳ Waiting for sequencer window to expire and safe height to exceed $TARGET_SAFE_HEIGHT... 
(expected wait time: ~${EXPECTED_WAIT_TIME}s)" +while true; do + CURRENT_SAFE=$(cast bn -r http://localhost:8123 safe 2>/dev/null || echo "0") + if [ "$CURRENT_SAFE" -gt "$TARGET_SAFE_HEIGHT" ]; then + echo "✅ Safe height reached: $CURRENT_SAFE (target: $TARGET_SAFE_HEIGHT)" + break + fi + ELAPSED_TIME=$(($(date +%s) - START_TIME)) + REMAINING_TIME=$((EXPECTED_WAIT_TIME - ELAPSED_TIME)) + if [ "$REMAINING_TIME" -lt 0 ]; then + REMAINING_TIME=0 + fi + echo " Current safe height: $CURRENT_SAFE, waiting for safe height > $TARGET_SAFE_HEIGHT... (elapsed: ${ELAPSED_TIME}s, remaining: ~${REMAINING_TIME}s)" + sleep 10 +done + docker compose up -d op-batcher +CURRENT_SAFE=$(cast bn -r http://localhost:8123 safe 2>/dev/null || echo "0") + +CHANNEL_TIMEOUT_GRANITE=50 +MARGIN=50 +L1_BLOCKTIME=2 +EXTRA_SAFE_HEIGHT=$((CHANNEL_TIMEOUT_GRANITE * L1_BLOCKTIME + MARGIN)) +TARGET_SAFE_HEIGHT=$((CURRENT_SAFE + EXTRA_SAFE_HEIGHT)) +EXPECTED_WAIT_TIME=$EXTRA_SAFE_HEIGHT +START_TIME=$(date +%s) +echo "⏳ Waiting for safe height to exceed $TARGET_SAFE_HEIGHT... (expected wait time: ~${EXPECTED_WAIT_TIME}s)" +while true; do + CURRENT_SAFE=$(cast bn -r http://localhost:8123 safe 2>/dev/null || echo "0") + if [ "$CURRENT_SAFE" -gt "$TARGET_SAFE_HEIGHT" ]; then + echo "✅ Safe height reached: $CURRENT_SAFE (target: $TARGET_SAFE_HEIGHT)" + break + fi + ELAPSED_TIME=$(($(date +%s) - START_TIME)) + REMAINING_TIME=$((EXPECTED_WAIT_TIME - ELAPSED_TIME)) + if [ "$REMAINING_TIME" -lt 0 ]; then + REMAINING_TIME=0 + fi + echo " Current safe height: $CURRENT_SAFE, waiting for safe height > $TARGET_SAFE_HEIGHT... (elapsed: ${ELAPSED_TIME}s, remaining: ~${REMAINING_TIME}s)" + sleep 10 +done + +# Wait for "decoded" keyword in op-seq logs +echo "⏳ Waiting for 'decoded' keyword in op-seq logs..." +while true; do + if docker logs op-seq 2>&1 | grep -q "decoded"; then + echo "✅ Found 'decoded' keyword in op-seq logs" + break + fi + echo " Waiting for 'decoded' keyword in op-seq logs..." + sleep 5 +done + +#sleep 20 + +# Wait for unsafe - safe < (seq window size * L1 blocktime) +echo "⏳ Waiting for unsafe - safe < 200..." +while true; do + CURRENT_SAFE=$(cast bn -r http://localhost:8123 safe 2>/dev/null || echo "0") + CURRENT_UNSAFE=$(cast bn -r http://localhost:8123 2>/dev/null || echo "0") + if [ "$CURRENT_SAFE" != "0" ] && [ "$CURRENT_UNSAFE" != "0" ] && [ $((CURRENT_UNSAFE - CURRENT_SAFE)) -lt 200 ]; then + echo "✅ Unsafe - safe < 200: unsafe=$CURRENT_UNSAFE, safe=$CURRENT_SAFE" + break + fi + docker compose restart op-seq + sleep 10 +done + +docker compose up -d op-rpc + +while true; do + SAFE_8124=$(cast bn -r http://localhost:8124 safe 2>/dev/null || echo "0") + if [ "$SAFE_8124" != "0" ]; then + # Check for fork at safe height + if ! $SCRIPTS_DIR/check-fork.sh "$SAFE_8124" 2>/dev/null; then + echo "❌ Fork detected at safe height $SAFE_8124, breaking loop" + break + fi + fi + echo " ⏳ Waiting for op-rpc to fork..." 
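+ # note: check-fork.sh exits non-zero when the two endpoints disagree or on query errors (its optional arguments are RPC URLs, not a height), so this loop intentionally polls until a fork is reported, then breaks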
+ sleep 5 +done + +$SCRIPTS_DIR/find-fork.sh + +exit 0 + PWD_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd $PWD_DIR EXPORT_DIR="$PWD_DIR/data/cannon-data" diff --git a/test/config-op/intent.toml.bak b/test/config-op/intent.toml.bak index 238bc8aa9ea14..b976cfd6c28ac 100644 --- a/test/config-op/intent.toml.bak +++ b/test/config-op/intent.toml.bak @@ -20,7 +20,7 @@ l2ContractsLocator = "file:///app/packages/contracts-bedrock/forge-artifacts" l2GenesisBlockGasLimit = "0xbebc200" l2GenesisBlockBaseFeePerGas = "0x3B9ACA00" l2BlockTime = 1 - sequencerWindowSize = 7200 + sequencerWindowSize = 100 [chains.roles] l1ProxyAdminOwner = "0xf39Fd6e51aad88F6F4ce6aB8827279cffFb92266" l2ProxyAdminOwner = "0xf39Fd6e51aad88F6F4ce6aB8827279cffFb92266" diff --git a/test/config-op/test.geth.rpc.config.toml b/test/config-op/test.geth.rpc.config.toml index 1350e26ad5084..8f22de625bb99 100644 --- a/test/config-op/test.geth.rpc.config.toml +++ b/test/config-op/test.geth.rpc.config.toml @@ -17,10 +17,7 @@ JWTSecret = "/jwt.txt" MaxPeers = 30 DiscoveryV5 = true # we add both geth and reth to static nodes, but only one is live -StaticNodes = [ - "enode://ef8135659def07b48b54fe2de7d0368e3eaa0a080ef13dde560169357900954be1a1e890b5973a821f9158e512a2da3ff600368f44e18e725a86931eaae5ef64@op-geth-seq:30303", - "enode://ef8135659def07b48b54fe2de7d0368e3eaa0a080ef13dde560169357900954be1a1e890b5973a821f9158e512a2da3ff600368f44e18e725a86931eaae5ef64@op-reth-seq:30303" -] +StaticNodes = [] [Eth] NetworkId = 901 diff --git a/test/docker-compose.yml b/test/docker-compose.yml index 9874145a08423..63455a174ddc8 100644 --- a/test/docker-compose.yml +++ b/test/docker-compose.yml @@ -238,6 +238,7 @@ services: - ./entrypoint:/entrypoint ports: - "8124:8545" + - "8553:8552" - "30304:30303" - "30304:30303/udp" @@ -275,7 +276,7 @@ services: - "9091:9091" # pprof port command: - /app/op-node/bin/op-node - - --log.level=debug + - --log.level=info - --l2=http://op-${SEQ_TYPE}-seq:8552 - --l2.jwt-secret=/jwt.txt - --sequencer.enabled @@ -323,7 +324,7 @@ services: - "9555:9545" command: - /app/op-node/bin/op-node - - --log.level=debug + - --log.level=info - --l2=http://op-${RPC_TYPE}-rpc:8552 - --l2.jwt-secret=/jwt.txt - --sequencer.enabled=false @@ -336,7 +337,7 @@ services: - --p2p.priv.raw=604557d042fbea9ed42f46c0c95c346a932b6a5ef0c0dd07a00dbf95801a2510 - --p2p.peerstore.path=/data/p2p/opnode_peerstore_db - --p2p.discovery.path=/data/p2p/opnode_discovery_db - - --p2p.static=/dns4/op-seq/tcp/9223/p2p/16Uiu2HAkzHdkbmS2VrCsccLibsu7MvGHpmFUMJnMTkKifrtS5m65 + #- --p2p.static=/dns4/op-seq/tcp/9223/p2p/16Uiu2HAkzHdkbmS2VrCsccLibsu7MvGHpmFUMJnMTkKifrtS5m65 - --p2p.no-discovery - --rpc.enable-admin=true - --l1=${L1_RPC_URL_IN_DOCKER} @@ -388,19 +389,19 @@ services: container_name: op-batcher command: - /app/op-batcher/bin/op-batcher - - --log.level=debug + - --log.level=info - --l2-eth-rpc=${OP_BATCHER_L2_ETH_RPC:-http://op-${SEQ_TYPE}-seq:8545} - --rollup-rpc=${OP_BATCHER_ROLLUP_RPC:-http://op-seq:9545} # - --txmgr.enable-cell-proofs=true - --wait-node-sync=true - --check-recent-txs-depth=5 - - --poll-interval=5s + - --poll-interval=1s - --batch-type=1 - --compression-algo=brotli-11 - --data-availability-type=auto - - --max-channel-duration=30 - - --target-num-frames=5 - - --sub-safety-margin=6 + - --max-channel-duration=10 + #- --target-num-frames=5 + - --sub-safety-margin=10 - --num-confirmations=4 - --network-timeout=10s - --safe-abort-nonce-too-low-count=3 @@ -645,7 +646,7 @@ services: retries: 10 start_period: 3s - # Keep op-seq3 to run 
op-geth as default EL
+  # Keep op-seq3 to run op-geth as default EL
   op-seq3:
     image: "${OP_STACK_IMAGE_TAG}"
     container_name: op-seq3
diff --git a/test/docs/restart-condition-calculation.md b/test/docs/restart-condition-calculation.md
new file mode 100644
index 0000000000000..416949f3ee2af
--- /dev/null
+++ b/test/docs/restart-condition-calculation.md
@@ -0,0 +1,215 @@
+# op-rpc Restart Condition Calculation
+
+## Goal
+By restarting `op-rpc`, make `FindL2Heads` rewind to a suitable safe head so that the node can accept the batch containing the fork height and realign with `op-seq`.
+
+## Definitions
+- `forkHeight`: L2 height of the fork
+- `forkTimestamp`: timestamp at the fork height
+- `batchTimestamp`: timestamp of the first block in the batch
+- `batchHeight`: L2 height of the first block in the batch
+- `batchBlockCount`: number of blocks in the batch
+- `targetSafeHeight`: target safe head height
+- `targetSafeNextL1Origin`: L1 origin number of block `targetSafeHeight + 1`
+- `targetUnsafeL1Origin`: L1 origin number of the unsafe head at restart time
+- `targetUnsafeHeight`: L2 height of the unsafe head at restart time
+- `seqWindowSize`: sequencing window size (in L1 blocks)
+
+## Calculation Steps
+
+### 1. Determine the fork point
+- Fork height: `forkHeight`
+- Fork timestamp: `forkTimestamp`
+
+### 2. Find the batch containing the fork height
+Collect all batches from the logs and filter for the ones that may contain `forkTimestamp` with the constraint:
+```
+batchTimestamp <= forkTimestamp <= batchTimestamp + batchBlockCount - 1
+```
+
+From the filtered batches, pick the one that appears last in the logs, then convert its batch timestamp to an L2 height:
+```
+batchHeight = forkHeight - (forkTimestamp - batchTimestamp)
+```
+
+### 3. Determine the target safe head
+Query op-rpc for the smallest `targetSafeHeight` that satisfies:
+1. **Height constraint**:
+   ```
+   batchHeight - 1 <= targetSafeHeight <= batchHeight + batchBlockCount - 2
+   ```
+2. **SequenceNumber constraint**:
+   ```
+   (targetSafeHeight + 1).SequenceNumber == 0
+   ```
+
+Query command:
+```bash
+cast rpc optimism_outputAtBlock $(cast to-hex <height>) -r http://localhost:9555 | jq -r '.blockRef'
+```
+
+Query the L1 origin number of block `targetSafeHeight + 1`; this is `targetSafeNextL1Origin`.
+
+### 4. Compute the restart condition
+`FindL2Heads` walks back from the unsafe head, finds the first block `n` (with `n.SequenceNumber == 0`) that satisfies the condition below, and returns `n`'s parent as the safe head.
+
+The condition:
+```go
+n.L1Origin.Number + seqWindowSize < unsafe_head.L1Origin.Number
+```
+
+For `FindL2Heads` to find `targetSafeHeight + 1` and return `targetSafeHeight`, block `targetSafeHeight + 1` must be the first block that satisfies the condition:
+```
+targetSafeNextL1Origin + seqWindowSize = targetUnsafeL1Origin - 1
+```
+
+Therefore, the restart condition is:
+```
+targetUnsafeL1Origin = targetSafeNextL1Origin + seqWindowSize + 1
+```
+
+Query op-rpc for the smallest `targetUnsafeHeight` whose L1 origin equals `targetUnsafeL1Origin`.
+
+## Summary
+
+1. **Determine the fork point**: `forkHeight`, `forkTimestamp`
+2. **Find the batch**: filter the logged batches with the constraint `batchTimestamp <= forkTimestamp <= batchTimestamp + batchBlockCount - 1`, pick the last one, and compute `batchHeight` with the formula above
+3. **Determine the target safe head**: query op-rpc for a `targetSafeHeight` that satisfies:
+   - `batchHeight - 1 <= targetSafeHeight <= batchHeight + batchBlockCount - 2`
+   - `(targetSafeHeight + 1).SequenceNumber == 0`
+   - query the L1 origin number of `targetSafeHeight + 1` as `targetSafeNextL1Origin`
+4. **Compute the restart condition**:
+   - `targetUnsafeL1Origin = targetSafeNextL1Origin + seqWindowSize + 1`
+5. **Find the smallest targetUnsafeHeight**: query op-rpc for the smallest `targetUnsafeHeight` whose L1 origin equals `targetUnsafeL1Origin`
+
+## Example (local test)
+
+Fork height: `forkHeight = 8594135`
+
+### 1. Determine the fork point
+- `forkHeight = 8594135`
+- `forkTimestamp = 1762583920` (queried via RPC: `cast block 8594135 -r http://localhost:8124`)
+
+### 2. Find the batch containing the fork height
+The last batch in `op-rpc.log` that contains `forkTimestamp`:
+- `batchTimestamp = 1762583872`
+- `batchBlockCount = 101`
+
+Compute `batchHeight`:
+```
+batchHeight = 8594135 - (1762583920 - 1762583872) = 8594087
+```
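+
+Since the local test uses an L2 block time of 1s, timestamp deltas map one-to-one onto height deltas, which is what the conversion formula relies on. A minimal bash sketch of this step with the example's values (a hypothetical helper, not part of the test scripts):
+
+```bash
+FORK_HEIGHT=8594135
+FORK_TS=1762583920
+BATCH_TS=1762583872
+BATCH_BLOCK_COUNT=101
+
+# Containment check: batchTimestamp <= forkTimestamp <= batchTimestamp + batchBlockCount - 1
+LAST_TS=$((BATCH_TS + BATCH_BLOCK_COUNT - 1))
+if [ "$FORK_TS" -ge "$BATCH_TS" ] && [ "$FORK_TS" -le "$LAST_TS" ]; then
+  # With a 1s L2 block time, each second of timestamp delta is one block.
+  echo "batchHeight = $((FORK_HEIGHT - (FORK_TS - BATCH_TS)))"  # 8594087
+fi
+```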
+### 3. Determine the target safe head
+Query op-rpc (port 9555) for a `targetSafeHeight` that satisfies the constraints:
+- Constraint: `8594086 <= targetSafeHeight <= 8594186`, with `(targetSafeHeight + 1).SequenceNumber == 0`
+- Query command: `cast rpc optimism_outputAtBlock $(cast to-hex <height>) -r http://localhost:9555 | jq -r '.blockRef | "\(.l1origin.number) \(.sequenceNumber)"'`
+- Query result: block 8594087 has SequenceNumber = 0 and L1 origin = 103
+- `targetSafeHeight = 8594086`
+- `targetSafeNextL1Origin = 103`
+
+### 4. Compute the restart condition
+```
+targetUnsafeL1Origin = 103 + 100 + 1 = 204
+```
+
+### 5. Find the smallest targetUnsafeHeight
+Query op-rpc (port 9555) for the smallest `targetUnsafeHeight` whose L1 origin equals `targetUnsafeL1Origin`.
+
+- Query command: `cast rpc optimism_outputAtBlock $(cast to-hex <height>) -r http://localhost:9555 | jq -r '.blockRef.l1origin.number'`
+- Query result: block 8594289 has L1 origin = 204
+- `targetUnsafeHeight = 8594289`
+
+**Restart timing**: restart `op-rpc` once its unsafe head reaches height 8594289 with L1 origin = 204.
+
+## Actual Testnet Calculation Data
+
+### Inputs
+- `forkHeight = 12821075`
+- `forkTimestamp = 1761279912`
+- `batchTimestamp = 1761279302`, `batchBlockCount = 8622`
+- `batchHeight = 12820465`
+
+### Results
+- `targetSafeHeight = 12820474` (timestamp = 1761279311)
+- `targetSafeNextL1Origin = 9477640`
+- `targetUnsafeL1Origin = 9481241`
+- `targetUnsafeHeight = 12863927`
+
+## Addendum: The Finalized Height Limit
+
+### Problem
+On testnet, after computing `targetUnsafeHeight` with the logic above and restarting `op-rpc`, the safe height did not rewind to `targetSafeHeight`.
+
+**Cause**: when `FindL2Heads` hits the finalized height while walking back, it stops. If the finalized height has already passed `targetSafeHeight`, the safe head can rewind at most to the finalized height and never reaches `targetSafeHeight`.
+
+The relevant logic (`op-node/rollup/sync/start.go:244-248`):
+```go
+// Don't traverse further than the finalized head to find a safe head
+if n.Number == result.Finalized.Number {
+	lgr.Info("Hit finalized L2 head, returning immediately", ...)
+	result.Safe = n
+	return result, nil
+}
+```
+
+### Solution
+Roll back the EL (Execution Layer) instead (a condensed sketch of the whole procedure follows this list):
+
+1. **Start op-rpc** and wait until the finalized height reaches or passes `targetSafeHeight`
+2. **Stop op-rpc**; the EL (op-geth or op-reth) then stops producing blocks
+3. **Roll back the EL** to `targetSafeHeight` with a tool; this rolls back the finalized height as well
+4. **Restart op-rpc**; after `FindL2Heads` runs, the safe head rewinds to `targetSafeHeight`
+5. **op-rpc then processes batches** with the same logic as a live node
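+
+A condensed sketch of the procedure for the local geth setup, assuming the docker-compose service name `op-rpc` and the ports used by this repo's test environment; the rewind uses the `op-wheel` invocation detailed in the next section:
+
+```bash
+TARGET_SAFE_HEIGHT=12820474
+
+# 1. Wait until the finalized height reaches the rewind target.
+while true; do
+  FINALIZED=$(cast block-number -r http://localhost:8124 finalized 2>/dev/null || echo 0)
+  [ "$FINALIZED" -ge "$TARGET_SAFE_HEIGHT" ] && break
+  sleep 10
+done
+
+# 2. Stop the CL so the EL stops advancing.
+docker compose stop op-rpc
+
+# 3. Rewind the EL (geth path; for reth, stop it and use `op-reth stage unwind`).
+op-wheel engine rewind \
+  --engine http://localhost:8553 \
+  --engine.jwt-secret-path ./config-op/jwt.txt \
+  --to "$TARGET_SAFE_HEIGHT" \
+  --set-head \
+  --engine.open http://localhost:8124
+
+# 4. Restart the CL; FindL2Heads now rewinds the safe head to the target.
+docker compose up -d op-rpc
+```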
+### Rolling back op-geth
+
+The official `op-wheel` tool can roll back op-geth. It drives the EL over RPC; the principle:
+1. Call the `debug_setHead` RPC method to roll back the database
+2. Call the Engine API `forkchoiceUpdated` method to set the unsafe, safe, and finalized heights
+
+**Example command**:
+```bash
+op-wheel engine rewind \
+  --engine http://localhost:8553 \
+  --engine.jwt-secret-path ./config-op/jwt.txt \
+  --to 12820474 \
+  --set-head \
+  --engine.open http://localhost:8124
+```
+
+**Parameters**:
+- `--engine`: Engine API endpoint (requires JWT authentication)
+- `--engine.jwt-secret-path`: path to the JWT secret file
+- `--to`: target rewind height
+- `--set-head`: also roll back the database (calls `debug_setHead`)
+- `--engine.open`: open RPC endpoint (used for the `debug_setHead` call)
+
+### Rolling back op-reth
+
+op-reth does not implement `debug_setHead` (the method can be called, but its body is empty), so `op-wheel` cannot roll it back. reth does, however, ship its own rewind command. Stop op-reth first, roll back the database with the command below, then start op-reth again.
+
+**Example command**:
+```bash
+op-reth stage unwind \
+  --datadir=/datadir \
+  --chain=/genesis.json \
+  --config=/config.toml \
+  to-block 12820474
+```
+
+**Parameters**:
+- `--datadir`: data directory path
+- `--chain`: genesis file path
+- `--config`: config file path
+- `to-block`: target rewind height
+
+**Effect**: the database is rolled back to the given height, and the finalized height is set to that height as well.
diff --git a/test/docs/sequencer-restart-fork-analysis.md b/test/docs/sequencer-restart-fork-analysis.md
new file mode 100644
index 0000000000000..6a18ce166f469
--- /dev/null
+++ b/test/docs/sequencer-restart-fork-analysis.md
@@ -0,0 +1,241 @@
+# Sequencer Restart Fork Analysis
+
+## Problem Description
+
+When `op-seq` is restarted, it causes forks with `op-rpc` nodes. The root cause is related to how `FindL2Heads` recalculates the safe head and how `initialReset` determines the L1 traversal starting point, which always rewinds to a very old L1 block regardless of safe head advancement.
+
+## Key Observations from Logs
+
+### op-seq First Startup (Line 30)
+- **Initial State**: `safe=8593921`, `safe_origin=22`
+- **Reset Origin**: `origin=22` (line 164)
+- **L1 Traversal**: Starts from L1 block 22, advances through 23, 24, 25... until L1 block 131
+- **First Batch Found** (line 1558): Found at L1 block 131
+  - `stage_origin=131` (batch included in L1 block 131)
+  - `start_epoch_number=26 end_epoch_number=66` (batch covers L1 blocks 26-66)
+  - `safe num=8593938` (current safe head when batch is checked)
+  - **Result**: Batch rejected as "sequence window expired"
+    - The check is: `startEpochNum + SeqWindowSize < l1InclusionBlock.Number`
+    - With `startEpochNum=26`, `SeqWindowSize=100` (from config), `l1InclusionBlock.Number=131`:
+      - `26 + 100 = 126 < 131` → **true** (batch IS expired)
+    - The batch is correctly rejected because its L1 origin (26) plus the sequence window size (100) is less than the L1 inclusion block (131), meaning the batch was included too late
+
+### op-rpc Startup (Line 25)
+- **Initial State**: `safe=8593921`, `safe_origin=22`
+- **Reset Origin**: `origin=22` (line 35)
+- **L1 Traversal**: Starts from L1 block 22, advances through 23, 24, 25...
until L1 block 131 +- **First Batch Found** (line 200): Found at L1 block 131 + - `stage_origin=131` (batch included in L1 block 131) + - `start_epoch_number=26 end_epoch_number=66` (batch covers L1 blocks 26-66) + - `safe num=8593938` (current safe head when batch is checked) + - **Result**: Batch rejected as "sequence window expired" (same as op-seq) + +- **Second Batch** (line 1563): Found at L1 block 131 + - `start_epoch_number=67 end_epoch_number=120` (batch covers L1 blocks 67-120) + - `batch_timestamp=1762751684`, `nextTimestamp=1762751593` (current safe head's next timestamp) + - **Result**: Batch rejected as "dropping future span batch" + - The check is: `batch.GetTimestamp() > nextTimestamp` + - `1762751684 > 1762751593` → **true** (batch timestamp is in the future) + - This happens because the first batch was dropped, so safe head didn't advance, leaving a gap + +- **Third and Subsequent Batches** (lines 1573, 3181, etc.): All found at L1 blocks 131-134 + - All have `start_epoch_number=121` or higher (covering L1 blocks 121+) + - All have `batch_timestamp` values (1762751792, 1762751793, etc.) greater than `nextTimestamp=1762751593` + - **Result**: All rejected as "dropping future span batch" + - Because the first batch was dropped, safe head remains at 8593938 + - All subsequent batches have timestamps that are too far in the future relative to the current safe head + - This creates a deadlock: batches can't be applied because they're "future", but safe head can't advance because no batches are being applied + +- **Empty Batch Generation** (line 1424): After all batches are dropped + - System starts generating empty batches via `deriveNextEmptyBatch` + - `epoch=22`, `timestamp=1762751576` (matches the expected next timestamp) + - Safe head advances through empty batch derivation (lines 1426, 1432, etc.) + - This allows the system to eventually catch up, but causes forks because `op-rpc` continues processing from its position without restarting + +### op-rpc Processing (No Restart) +- **op-rpc** starts at the same time as `op-seq` first startup (line 25) +- **Initial State**: `safe=8593921`, `safe_origin=22` +- **Reset Origin**: `origin=22` (line 35) +- **L1 Traversal**: Starts from L1 block 22, advances through 23, 24, 25... +- **Batch Processing**: `op-rpc` processes batches continuously without restarting + - First batch found at L1 block 131 (line 200): Same as `op-seq`, rejected as "sequence window expired" + - Second batch found at L1 block 131 (line 205): Same as `op-seq`, rejected as "dropping future span batch" + - **Key Difference**: `op-rpc` continues processing from L1 block 22 onwards, while `op-seq` restarts and rewinds to L1 block 47 + +## Root Cause Analysis + +### 1. FindL2Heads Logic + +When `op-seq` restarts, `FindL2Heads` is called to determine the safe head: + +```go +// From op-node/rollup/sync/start.go:237 +if n.Number <= result.Safe.Number && + n.L1Origin.Number+cfg.SyncLookback() < highestL2WithCanonicalL1Origin.L1Origin.Number && + n.SequenceNumber == 0 { + ready = true +} +``` + +**Behavior on restart:** +When `op-seq` restarts, it calls `FindL2Heads` to determine the safe head. This function performs the following steps: + +1. **Read current forkchoice state from Engine** (line 118 in `start.go`): + The function first queries the execution engine to get the current unsafe, safe, and finalized heads. This initial state is logged as "Loaded current L2 heads". + +2. 
**Walk back and recalculate safe head** (lines 148-287 in `start.go`): + Starting from the unsafe head, the function walks backward through the L2 chain, verifying each block's L1 origin against the canonical L1 chain. It identifies the first L2 block whose L1 origin is sufficiently old (i.e., its sequence window has closed) and whose sequence number is 0. The parent of this block becomes the recalculated safe head. + +The recalculation is necessary because the L1 chain may have advanced during downtime. As a result, more L2 blocks may now satisfy the sequence window requirement, potentially yielding a higher safe head than what was stored in the Engine. + +**From the logs (op-seq restart, line 5160):** +- **Initial state (read from Engine):** + - unsafe: 8594297, L1 origin: 199 + - safe: 8594084, L1 origin: 103 +- **After FindL2Heads walk-back (line 5487):** + - FindL2Heads finds L2 block 8594073 with L1 origin 98, sequence number 0 + - The condition `n.Number <= result.Safe.Number && n.L1Origin.Number+cfg.SyncLookback() < highestL2WithCanonicalL1Origin.L1Origin.Number && n.SequenceNumber == 0` becomes true + - Safe head is set to parent of this block: 8594072 +- **After recalculation (line 5489):** + - unsafe: 8594297, L1 origin: 199 (unchanged) + - safe: 8594072, L1 origin: 97 (recalculated) + +The walk-back process verified L2 blocks from unsafe head (8594297) down to the recalculated safe head (8594072), confirming that the safe head's L1 origin (97) is sufficiently old relative to the current L1 chain state (199). + +### 2. initialReset Logic + +After `FindL2Heads` determines the new safe head, `initialReset` is called to determine the L1 traversal starting point: + +```go +// From op-node/rollup/derive/pipeline.go:247 +afterChannelTimeout := pipelineL2.L1Origin.Number+spec.ChannelTimeout(pipelineOrigin.Time) > l1Origin.Number +``` + +**Process:** +1. `initialReset` starts from the new safe head (`resetL2Safe`) +2. It walks back the L2 chain while checking: `pipelineL2.L1Origin.Number + ChannelTimeout > safe_head.L1Origin.Number` +3. The walk continues until the condition becomes false or reaches genesis + +**The Logic:** +With `ChannelTimeout = 50` (hardcoded `ChannelTimeoutGranite` since `granite_time: 0`), `initialReset` walks back from the recalculated safe head while the following condition is true: +``` +pipelineL2.L1Origin.Number + 50 > safe_head.L1Origin.Number +``` + +The loop stops when this condition becomes false, i.e., when: +``` +pipelineL2.L1Origin.Number + 50 <= safe_head.L1Origin.Number +``` + +**Why ChannelTimeout is needed:** + +The reason for rewinding 50 L1 blocks (ChannelTimeout) is to ensure **channel completeness**, even though some batches in those channels may be skipped: + +1. **Channel completeness requirement**: A channel can only be read when all its frames are present. The `IsReady()` check requires: + - The last frame (`isLast=true`) must be seen + - All frames from frame 0 to `endFrameNumber` must be present + - Missing any frame prevents the entire channel from being read + +2. **Channels span multiple L1 blocks**: A single channel's frames may be distributed across multiple L1 blocks. For example: + - Frame 0 might be in L1 block 46 + - Frame 1 might be in L1 block 47 + - ... + - Frame 10 (last frame) might be in L1 block 96 + +3. 
**The problem without rewinding**: If we only start reading from the safe head's L1 origin (e.g., L1 block 96), we would: + - Only read Frame 10 from L1 block 96 + - Miss Frames 0-9 from L1 blocks 46-95 + - Never be able to complete the channel (missing frames) + - Be unable to decode any batches from that channel, even if they need to be processed + +4. **Why read frames even if batches will be skipped**: + - We cannot determine which batches need processing until the entire channel is decoded + - A channel may contain a mix of old batches (to be skipped) and new batches (to be processed) + - The channel must be complete before we can decode and filter batches + - Without all frames, the channel remains incomplete and unusable + +5. **ChannelTimeout = 50**: This represents the maximum number of L1 blocks a channel can span. By rewinding 50 blocks, we ensure we can read all frames of any channel that might still be valid (not timed out). + +**From the logs (op-seq restart, line 5513):** +- Recalculated safe head: 8594072, L1 origin: 97 +- The condition to continue walking: `pipelineL2.L1Origin.Number + 50 > 97`, i.e., `pipelineL2.L1Origin.Number > 47` +- `initialReset` walks back from L2 block 8594072 until it finds an L2 block with L1 origin <= 47 +- It finds an L2 block with L1 origin 47 (47 + 50 = 97 <= 97, condition becomes false) +- Final L1 traversal origin: 47 + +**The Problem:** +Even though the safe head has advanced to L1 origin 97, `initialReset` rewinds the L1 traversal back to L1 block 47 (50 blocks behind the safe head's L1 origin). This causes the pipeline to re-process batches from L1 blocks 47-97 that were already applied, potentially leading to different batch selection than a continuously running node. + +**Comparison with op-rpc:** +- **op-rpc** (line 35): Starts from L1 block 22 (safe head's L1 origin), continues processing without restart +- **op-seq after restart** (line 5513): Rewinds to L1 block 47 (50 blocks behind safe head's L1 origin 97) +- **Result**: `op-seq` re-processes batches from L1 blocks 47-97, while `op-rpc` continues from L1 block 22, causing divergence + +### 3. Why This Causes Forks + +**Scenario:** +1. **op-seq first startup** (line 30): Safe head at 8593921 with L1 origin 22, starts from L1 block 22 +2. **op-rpc startup** (line 25): Safe head at 8593921 with L1 origin 22, starts from L1 block 22 (same as op-seq) +3. **op-seq restart** (line 5160): Safe head recalculated to 8594072 with L1 origin 97, but `initialReset` rewinds to L1 block 47 +4. **Problem**: Starting from L1 block 47 (50 blocks behind safe head's L1 origin 97) means the pipeline will: + - Re-process batches from L1 blocks 47-97 that were already applied + - Potentially find different batches than what `op-rpc` (which didn't restart) would find + - This causes a fork + +**Why op-rpc doesn't fork:** +- `op-rpc` doesn't restart, so it doesn't go through `FindL2Heads` and `initialReset` +- It continues from where it left off, processing batches in order from L1 block 22 +- It doesn't re-process old batches from L1 blocks 47-97 +- When `op-seq` restarts and rewinds to L1 block 47, it may process different batches than `op-rpc`, causing divergence + +## Code Flow + +### FindL2Heads (Startup) +``` +1. Load current L2 heads from Engine +2. Walk back from unsafe head +3. Verify each L2 block's L1 origin against canonical L1 chain +4. Find first block where: L1Origin.Number + SyncLookback < highestL2WithCanonicalL1Origin.L1Origin.Number +5. 
Set safe head to parent of this block +``` + +### initialReset (Pipeline Reset) +``` +1. Start from resetL2Safe (new safe head from FindL2Heads) +2. Walk back L2 chain while: pipelineL2.L1Origin.Number + ChannelTimeout > safe_head.L1Origin.Number +3. With ChannelTimeout = 50 and safe_head.L1Origin = 20: + - Condition: pipelineL2.L1Origin.Number + 50 > 20 + - Since L1 origin numbers are non-negative, this is always true for valid blocks + - Loop continues until afterL1Genesis becomes false (reaches genesis) + - Stops at L1 origin 20 (the safe head's L1 origin) +4. L1 traversal starts from L1 block 20 +5. Pipeline advances through L1 blocks 20, 21, 22... until finding batches +6. First batch found at L1 block 156, but it's rejected as expired +``` + +## The Issue + +The `initialReset` logic walks back to find an L1 origin old enough to buffer channel data (accounting for `ChannelTimeout = 50` since `granite_time: 0`). + +**The problem**: The condition `pipelineL2.L1Origin.Number + ChannelTimeout > safe_head.L1Origin.Number` causes `initialReset` to always rewind 50 L1 blocks behind the safe head's L1 origin. This means that even when the safe head has advanced significantly (e.g., to L1 origin 97), the pipeline still starts from an old L1 block (e.g., L1 block 47), re-processing batches that were already applied. + +**From the logs:** +1. **op-seq startup**: Safe head at 8593921 with L1 origin 22 +2. **op-seq initialReset**: Walks back to L1 origin 22 (stops at safe head's L1 origin) +3. **op-seq L1 Traversal**: Starts from L1 block 22, advances through 23, 24, 25... until L1 block 131 +4. **op-seq First Batch**: Found at L1 block 131, but rejected as "sequence window expired" + - Batch's `start_epoch_number=26` (covers L1 blocks 26-66) + - Batch included in L1 block 131 + - Check: `startEpochNum + SeqWindowSize < l1InclusionBlock.Number` → `26 + 100 < 131` → `126 < 131` → **true** + - The batch is correctly rejected as expired because `26 + 100 = 126 < 131`, meaning the batch was included too late + +5. **op-seq restart**: Safe head recalculated to 8594072 with L1 origin 97 +6. **op-seq initialReset after restart**: Walks back to L1 origin 47 (97 - 50 = 47) +7. **op-seq L1 Traversal after restart**: Starts from L1 block 47, advances through 48, 49, 50... +8. **op-rpc**: Continues from L1 block 22, processing batches in order without restart + +**Why this causes forks:** +1. **Re-processing of old batches**: `op-seq` restart rewinds to L1 block 47, re-processing batches that were already applied +2. **Different starting points**: `op-seq` starts from L1 block 47 after restart, while `op-rpc` continues from L1 block 22 +3. **Inconsistent batch selection**: `op-seq` may find and process different batches from L1 blocks 47-97 than `op-rpc` would find +4. **Forks**: The sequencer and RPC nodes diverge because they're processing batches from different starting points and may select different batches, leading to different L2 block sequences diff --git a/test/docs/sequencer-window-expiry-recovery.md b/test/docs/sequencer-window-expiry-recovery.md new file mode 100644 index 0000000000000..b27cd48eef0d2 --- /dev/null +++ b/test/docs/sequencer-window-expiry-recovery.md @@ -0,0 +1,572 @@ +# Sequencer Window Expiry Recovery Test: Technical Deep Dive + +## Overview + +This document provides a detailed technical analysis of a test scenario that demonstrates the sequencer window expiry recovery mechanism in Optimism. 
The test simulates a scenario where `op-batcher` is down for an extended period, causing the sequencer window to expire, and then explores the recovery process. + +## Test Flow Summary + +1. **Start mock L1** - Initialize a local L1 chain +2. **Start op-seq without op-batcher** - Sequencer produces blocks (unsafe head advances), but no batches are submitted to L1, so safe head remains unchanged +3. **Sequencer window expires** - When the gap between unsafe and safe head exceeds `SeqWindowSize`, the sequencer window expires, and safe head starts advancing via empty batch derivation +4. **Start op-batcher** - Batcher submits batches, but they are rejected due to sequence window expiry +5. **Restart op-seq** - System resets and recovers by finding valid batches from earlier L1 blocks +6. **Start op-rpc** - RPC node syncs and may fork from sequencer + +--- + +## Phase 1: Initial Setup - Mock L1 + +The test starts by initializing a mock L1 chain. This is a standard setup step and doesn't require detailed code analysis. + +--- + +## Phase 2: Sequencer Window Expiry - No Batcher Running + +### What Happens + +When `op-seq` and `op-geth-seq` are started but `op-batcher` is not running: +- The sequencer continues to produce L2 blocks (unsafe head advances) +- No batches are submitted to L1 +- Safe head remains unchanged (no new batches to derive from) +- The gap between unsafe and safe head keeps growing +- **When the gap exceeds `SeqWindowSize`**: The sequencer window expires, and `op-node` starts deriving empty batches to advance the safe head + +### Code Analysis: Empty Batch Derivation + +When the sequencer window expires, `op-node` automatically derives empty batches to advance the safe head. This logic is implemented in `op-node/rollup/derive/base_batch_stage.go`: + +```162:206:op-node/rollup/derive/base_batch_stage.go +// deriveNextEmptyBatch may derive an empty batch if the sequencing window is expired +func (bs *baseBatchStage) deriveNextEmptyBatch(ctx context.Context, outOfData bool, parent eth.L2BlockRef) (*SingularBatch, error) { + epoch := bs.l1Blocks[0] + // If the current epoch is too old compared to the L1 block we are at, + // i.e. if the sequence window expired, we create empty batches for the current epoch + expiryEpoch := epoch.Number + bs.config.SeqWindowSize + forceEmptyBatches := (expiryEpoch == bs.origin.Number && outOfData) || expiryEpoch < bs.origin.Number + firstOfEpoch := epoch.Number == parent.L1Origin.Number+1 + nextTimestamp := parent.Time + bs.config.BlockTime + + bs.log.Trace("Potentially generating an empty batch", + "expiryEpoch", expiryEpoch, "forceEmptyBatches", forceEmptyBatches, "nextTimestamp", nextTimestamp, + "epoch_time", epoch.Time, "len_l1_blocks", len(bs.l1Blocks), "firstOfEpoch", firstOfEpoch) + + if !forceEmptyBatches { + // sequence window did not expire yet, still room to receive batches for the current epoch, + // no need to force-create empty batch(es) towards the next epoch yet. + return nil, io.EOF + } + if len(bs.l1Blocks) < 2 { + // need next L1 block to proceed towards + return nil, io.EOF + } + + nextEpoch := bs.l1Blocks[1] + // Fill with empty L2 blocks of the same epoch until we meet the time of the next L1 origin, + // to preserve that L2 time >= L1 time. If this is the first block of the epoch, always generate a + // batch to ensure that we at least have one batch per epoch. 
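+ // (Annotation added for this doc, not in the source: each call returns at most one empty SingularBatch; the derivation loop calls this stage repeatedly, filling the epoch one block per call until nextTimestamp reaches nextEpoch.Time.)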
+ if nextTimestamp < nextEpoch.Time || firstOfEpoch { + bs.log.Info("Generating next batch", "epoch", epoch, "timestamp", nextTimestamp, "parent", parent) + return &SingularBatch{ + ParentHash: parent.Hash, + EpochNum: rollup.Epoch(epoch.Number), + EpochHash: epoch.Hash, + Timestamp: nextTimestamp, + Transactions: nil, + }, nil + } + + // At this point we have auto generated every batch for the current epoch + // that we can, so we can advance to the next epoch. + bs.log.Trace("Advancing internal L1 blocks", "next_timestamp", nextTimestamp, "next_epoch_time", nextEpoch.Time) + bs.l1Blocks = bs.l1Blocks[1:] + return nil, io.EOF +} +``` + +**Key Points:** +- `forceEmptyBatches` is true when `expiryEpoch <= bs.origin.Number`, meaning the current epoch's sequence window has closed +- Empty batches are generated one at a time (each `SingularBatch` contains one L2 block) until the L2 timestamp catches up to the next L1 epoch's time +- The condition `nextTimestamp < nextEpoch.Time || firstOfEpoch` ensures batches are generated until `nextTimestamp >= nextEpoch.Time` + +**Why 12 Empty Blocks Are Generated Together?** + +In the test environment, L1 block time is 2 seconds and L2 block time is 1 second. According to the code logic, each L1 epoch should generate 2 empty L2 blocks (since `nextEpoch.Time - epoch.Time = 2 seconds`). However, the system generates 12 empty blocks together. This requires further investigation into the actual behavior. + +Looking at the code in `deriveNextEmptyBatch`: + +```186:199:op-node/rollup/derive/base_batch_stage.go + nextEpoch := bs.l1Blocks[1] + // Fill with empty L2 blocks of the same epoch until we meet the time of the next L1 origin, + // to preserve that L2 time >= L1 time. If this is the first block of the epoch, always generate a + // batch to ensure that we at least have one batch per epoch. + if nextTimestamp < nextEpoch.Time || firstOfEpoch { + bs.log.Info("Generating next batch", "epoch", epoch, "timestamp", nextTimestamp, "parent", parent) + return &SingularBatch{ + ParentHash: parent.Hash, + EpochNum: rollup.Epoch(epoch.Number), + EpochHash: epoch.Hash, + Timestamp: nextTimestamp, + Transactions: nil, + }, nil + } +``` + +The function generates empty batches in a loop: +1. Each iteration: `nextTimestamp = parent.Time + cfg.BlockTime` (increments by L2 block time, which is 1 second) +2. Condition check: `nextTimestamp < nextEpoch.Time || firstOfEpoch` +3. If true: generate one empty batch (one L2 block) and return +4. The function is called again with the updated `parent` (the newly generated L2 block) +5. This continues until `nextTimestamp >= nextEpoch.Time`, at which point the epoch advances + +**The Key**: The number of empty blocks generated per epoch depends on `nextEpoch.Time - epoch.Time`, which is the **actual timestamp difference** between consecutive L1 blocks. + +If L1 blocks have a 2-second timestamp difference: +- `epoch.Time = T` +- `nextEpoch.Time = T + 2` +- L2 blocks should be generated with timestamps: `T, T+1` (2 blocks) +- When `nextTimestamp = T + 2 >= nextEpoch.Time`, the epoch advances + +**Why 12 blocks are observed**: Even though L1 blocks have a 2-second timestamp difference, the observation of 12 empty blocks being generated together requires further investigation. Possible explanations: + +1. **Fast derivation loop**: When the sequencer window expires, the derivation pipeline may rapidly process multiple epochs in quick succession. 
If 6 epochs are processed together, each generating 2 blocks, this would result in 12 blocks (6 × 2 = 12). + +2. **First-of-epoch behavior**: The `firstOfEpoch` condition in the code ensures at least one batch per epoch. Combined with rapid epoch processing, this might contribute to the observed pattern. + +The exact mechanism requires further investigation through log analysis to understand when and why 12 blocks are generated together, and whether this is a consistent pattern or specific to certain conditions. + +### Test Script Logic + +The test waits for the safe height to exceed a target value: + +```94:111:test/4-op-start-service.sh +TARGET_SAFE_HEIGHT=8593921 +EXPECTED_WAIT_TIME=200 +START_TIME=$(date +%s) +echo "⏳ Waiting for sequencer window expired and safe height to exceed $TARGET_SAFE_HEIGHT... (expected wait time: ~${EXPECTED_WAIT_TIME}s)" +while true; do + CURRENT_SAFE=$(cast bn -r http://localhost:8123 safe 2>/dev/null || echo "0") + if [ "$CURRENT_SAFE" -gt "$TARGET_SAFE_HEIGHT" ]; then + echo "✅ Safe height reached: $CURRENT_SAFE (target: $TARGET_SAFE_HEIGHT)" + break + fi + ELAPSED_TIME=$(($(date +%s) - START_TIME)) + REMAINING_TIME=$((EXPECTED_WAIT_TIME - ELAPSED_TIME)) + if [ "$REMAINING_TIME" -lt 0 ]; then + REMAINING_TIME=0 + fi + echo " Current safe height: $CURRENT_SAFE, waiting for safe height > $TARGET_SAFE_HEIGHT... (elapsed: ${ELAPSED_TIME}s, remaining: ~${REMAINING_TIME}s)" + sleep 5 +done +``` + +--- + +## Phase 3: Batcher Starts - Batch Rejection Cascade + +### What Happens + +When `op-batcher` starts after the sequencer window has expired: +1. **First batch is rejected** - The batch contains blocks with L1 origins that are too old (exceeding `SeqWindowSize`) +2. **Subsequent batches are rejected** - Because the first batch was dropped, safe head didn't advance, causing timestamp mismatches +3. **System deadlocks** - `op-seq` and `op-batcher` cannot coordinate properly + +### Code Analysis: Sequence Window Expiry Check + +The sequence window expiry check is performed in `op-node/rollup/derive/batches.go`: + +```268:272:op-node/rollup/derive/batches.go + // Filter out batches that were included too late. + if startEpochNum+cfg.SeqWindowSize < l1InclusionBlock.Number { + log.Warn("batch was included too late, sequence window expired") + return BatchDrop, parentBlock + } +``` + +**Why the first batch fails:** +- The batch's `startEpochNum` (L1 origin of the first block in the batch) is old +- The batch was included in L1 block `l1InclusionBlock.Number` +- If `startEpochNum + SeqWindowSize < l1InclusionBlock.Number`, the batch is dropped + +### Code Analysis: Future Batch Check + +After the first batch is dropped, subsequent batches fail the "future batch" check: + +```225:232:op-node/rollup/derive/batches.go + if batch.GetTimestamp() > nextTimestamp { + if cfg.IsHolocene(l1InclusionBlock.Time) { + log.Warn("dropping future span batch", "next_timestamp", nextTimestamp) + return BatchDrop, eth.L2BlockRef{} + } + log.Trace("received out-of-order batch for future processing after next batch", "next_timestamp", nextTimestamp) + return BatchFuture, eth.L2BlockRef{} + } +``` + +**Why subsequent batches fail:** +- `nextTimestamp = l2SafeHead.Time + cfg.BlockTime` (the expected timestamp of the next block) +- Because the first batch was dropped, `l2SafeHead` didn't advance +- The second batch's first block has a timestamp that is ahead of `nextTimestamp` +- This causes the batch to be rejected as a "future batch" + +### The Cascade Effect + +1. 
**Batch 1**: Contains blocks from L1 origin 6584, included in L1 block 6789 + - Check: `6584 + 200 < 6789` → **Dropped** (sequence window expired) + - Safe head remains at old position + +2. **Batch 2**: Contains blocks starting from where Batch 1 ended + - Check: `batch.GetTimestamp() > nextTimestamp` → **Dropped** (future batch) + - Safe head still hasn't advanced + +3. **Batch 3+**: All subsequent batches fail the same future batch check + +### Test Script Logic + +The test waits for the "decoded" keyword in logs, indicating a batch was processed: + +```115:124:test/4-op-start-service.sh +# Wait for "decoded" keyword in op-seq logs +echo "⏳ Waiting for 'decoded' keyword in op-seq logs..." +while true; do + if docker logs op-seq 2>&1 | grep -q "decoded"; then + echo "✅ Found 'decoded' keyword in op-seq logs" + break + fi + echo " Waiting for 'decoded' keyword in op-seq logs..." + sleep 5 +done +``` + +However, in this scenario, batches are being dropped, so the system cannot recover without intervention. + +--- + +## Phase 4: Sequencer Restart - Recovery Mechanism + +### What Happens + +Restarting `op-seq` triggers a reset process: +1. **FindL2Heads** - Recalculates safe/unsafe heads by walking back the L2 chain +2. **Initial Reset** - Rewinds L1 traversal to an earlier point to start buffering channel data +3. **Batch Processing** - Processes batches from earlier L1 blocks, skipping invalid ones +4. **Recovery** - Eventually finds valid batches and resumes normal operation + +### Code Analysis: FindL2Heads on Startup + +When `op-node` starts, it calls `FindL2Heads` to determine the safe and unsafe heads: + +```103:261:op-node/rollup/sync/start.go +// FindL2Heads walks back from `start` (the previous unsafe L2 block) and finds +// the finalized, unsafe and safe L2 blocks. +// +// - The *unsafe L2 block*: This is the highest L2 block whose L1 origin is a *plausible* +// extension of the canonical L1 chain (as known to the op-node). +// - The *safe L2 block*: This is the highest L2 block whose epoch's sequencing window is +// complete within the canonical L1 chain (as known to the op-node). +// - The *finalized L2 block*: This is the L2 block which is known to be fully derived from +// finalized L1 block data. +// +// Plausible: meaning that the blockhash of the L2 block's L1 origin +// (as reported in the L1 Attributes deposit within the L2 block) is not canonical at another height in the L1 chain, +// and the same holds for all its ancestors. +func FindL2Heads(ctx context.Context, cfg *rollup.Config, l1 L1Chain, l2 L2Chain, lgr log.Logger, syncCfg *Config) (result *FindHeadsResult, err error) { + // Fetch current L2 forkchoice state + result, err = currentHeads(ctx, cfg, l2) + if err != nil { + return nil, fmt.Errorf("failed to fetch current L2 forkchoice state: %w", err) + } + + lgr.Info("Loaded current L2 heads", "unsafe", result.Unsafe, "safe", result.Safe, "finalized", result.Finalized, + "unsafe_origin", result.Unsafe.L1Origin, "safe_origin", result.Safe.L1Origin) + + // ... validation checks ... + + // Current L2 block. + n := result.Unsafe + + var highestL2WithCanonicalL1Origin eth.L2BlockRef // the highest L2 block with confirmed canonical L1 origin + var l1Block eth.L1BlockRef // the L1 block at the height of the L1 origin of the current L2 block n. + var ahead bool // when "n", the L2 block, has a L1 origin that is not visible in our L1 chain source yet + + ready := false // when we found the block after the safe head, and we just need to return the parent block. 
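+ // (Annotation added for this doc: the walk below fetches one L1 block per L2 step, retrying transient RPC failures with the exponential backoff configured next.)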
+ bOff := retry.Exponential() + + // Each loop iteration we traverse further from the unsafe head towards the finalized head. + // Once we pass the previous safe head and we have seen at least a full sequence window worth of L1 blocks to confirm, + // then we return the last L2 block of the epoch before that as safe head. + // Each loop iteration we traverse a single L2 block, and we check if the L1 origins are consistent. + for { + // Fetch L1 information if we never had it, or if we do not have it for the current origin. + // Optimization: as soon as we have a previous L1 block, try to traverse L1 by hash instead of by number, to fill the cache. + if n.L1Origin.Hash == l1Block.ParentHash { + b, err := retry.Do(ctx, 5, bOff, func() (eth.L1BlockRef, error) { return l1.L1BlockRefByHash(ctx, n.L1Origin.Hash) }) + if err != nil { + // Exit, find-sync start should start over, to move to an available L1 chain with block-by-number / not-found case. + return nil, fmt.Errorf("failed to retrieve L1 block: %w", err) + } + lgr.Info("Walking back L1Block by hash", "curr", l1Block, "next", b, "l2block", n) + l1Block = b + ahead = false + } else if l1Block == (eth.L1BlockRef{}) || n.L1Origin.Hash != l1Block.Hash { + b, err := retry.Do(ctx, 5, bOff, func() (eth.L1BlockRef, error) { return l1.L1BlockRefByNumber(ctx, n.L1Origin.Number) }) + // if L2 is ahead of L1 view, then consider it a "plausible" head + notFound := errors.Is(err, ethereum.NotFound) + if err != nil && !notFound { + return nil, fmt.Errorf("failed to retrieve block %d from L1 for comparison against %s: %w", n.L1Origin.Number, n.L1Origin.Hash, err) + } + l1Block = b + ahead = notFound + lgr.Info("Walking back L1Block by number", "curr", l1Block, "next", b, "l2block", n) + } + + // ... validation and safe head determination logic ... + + // If the L2 block is at least as old as the previous safe head, and we have seen at least a full sequence window worth of L1 blocks to confirm + if n.Number <= result.Safe.Number && n.L1Origin.Number+cfg.SyncLookback() < highestL2WithCanonicalL1Origin.L1Origin.Number && n.SequenceNumber == 0 { + ready = true + } + + // ... continue traversal ... + } +} +``` + +**Key Points:** +- `FindL2Heads` walks back from the unsafe head, verifying each L2 block's L1 origin against the canonical L1 chain +- The safe head is determined as the highest L2 block whose L1 origin's sequence window has closed +- This process is logged as "Walking back L1Block" in the logs + +### Code Analysis: Initial Reset + +After determining the safe head, the pipeline performs an initial reset: + +```228:265:op-node/rollup/derive/pipeline.go +// initialReset does the initial reset work of finding the L1 point to rewind back to +func (dp *DerivationPipeline) initialReset(ctx context.Context, resetL2Safe eth.L2BlockRef) error { + dp.log.Info("Rewinding derivation-pipeline L1 traversal to handle reset") + + dp.metrics.RecordPipelineReset() + spec := rollup.NewChainSpec(dp.rollupCfg) + + // Walk back L2 chain to find the L1 origin that is old enough to start buffering channel data from. 
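+ // (Annotation added for this doc: "old enough" means at least ChannelTimeout L1 blocks behind the reset point's L1 origin, so every frame of any still-open channel can be re-read.)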
+ pipelineL2 := resetL2Safe + l1Origin := resetL2Safe.L1Origin + + pipelineOrigin, err := dp.l1Fetcher.L1BlockRefByHash(ctx, l1Origin.Hash) + if err != nil { + return NewTemporaryError(fmt.Errorf("failed to fetch the new L1 progress: origin: %s; err: %w", pipelineL2.L1Origin, err)) + } + + for { + afterL2Genesis := pipelineL2.Number > dp.rollupCfg.Genesis.L2.Number + afterL1Genesis := pipelineL2.L1Origin.Number > dp.rollupCfg.Genesis.L1.Number + afterChannelTimeout := pipelineL2.L1Origin.Number+spec.ChannelTimeout(pipelineOrigin.Time) > l1Origin.Number + if afterL2Genesis && afterL1Genesis && afterChannelTimeout { + parent, err := dp.l2.L2BlockRefByHash(ctx, pipelineL2.ParentHash) + if err != nil { + return NewResetError(fmt.Errorf("failed to fetch L2 parent block %s", pipelineL2.L2BlockID())) + } + pipelineL2 = parent + pipelineOrigin, err = dp.l1Fetcher.L1BlockRefByHash(ctx, pipelineL2.L1Origin.Hash) + if err != nil { + return NewTemporaryError(fmt.Errorf("failed to fetch the new L1 progress: origin: %s; err: %w", pipelineL2.L1Origin, err)) + } + } else { + break + } + } + + sysCfg, err := dp.l2.SystemConfigByL2Hash(ctx, pipelineL2.Hash) + if err != nil { + return NewTemporaryError(fmt.Errorf("failed to fetch L1 config of L2 block %s: %w", pipelineL2.L2BlockID(), err)) + } + + dp.origin = pipelineOrigin + dp.resetSysConfig = sysCfg + dp.resetL2Safe = resetL2Safe + return nil +} +``` + +**Key Points:** +- The reset walks back the L2 chain to find an L1 origin that is old enough (considering `ChannelTimeout`) to start buffering channel data from +- This ensures the pipeline can read all necessary L2 data from L1 to construct batches after the safe head +- The `dp.origin` is set to this earlier L1 block, which becomes the starting point for L1 traversal + +### Code Analysis: Batch Processing During Recovery + +During recovery, the pipeline processes batches from L1 blocks, skipping invalid ones: + +```233:239:op-node/rollup/derive/batches.go + if batch.GetBlockTimestamp(batch.GetBlockCount()-1) < nextTimestamp { + log.Warn("span batch has no new blocks after safe head") + if cfg.IsHolocene(l1InclusionBlock.Time) { + return BatchPast, eth.L2BlockRef{} + } + return BatchDrop, eth.L2BlockRef{} + } +``` + +**Recovery Process:** +1. Pipeline starts from an earlier L1 block (determined by `initialReset`) +2. As it advances through L1 blocks, it checks each block for batch data +3. Invalid batches (too old, future, etc.) are dropped with warnings +4. Eventually, a valid batch is found that can be applied to the current safe head +5. Once a valid batch is processed, the system resumes normal operation + +### Test Script Logic + +The test waits for `unsafe - safe < 200` and periodically restarts the sequencer: + +```128:139:test/4-op-start-service.sh +# Wait for unsafe - safe < (seq window size * L1 blocktime) +echo "⏳ Waiting for unsafe - safe < 200..." +while true; do + CURRENT_SAFE=$(cast bn -r http://localhost:8123 safe 2>/dev/null || echo "0") + CURRENT_UNSAFE=$(cast bn -r http://localhost:8123 2>/dev/null || echo "0") + if [ "$CURRENT_SAFE" != "0" ] && [ "$CURRENT_UNSAFE" != "0" ] && [ $((CURRENT_UNSAFE - CURRENT_SAFE)) -lt 200 ]; then + echo "✅ Unsafe - safe < 200: unsafe=$CURRENT_UNSAFE, safe=$CURRENT_SAFE" + break + fi + $SCRIPTS_DIR/restart-op-seq.sh + sleep 5 +done +``` + +The restart allows the system to reset and find valid batches from earlier L1 blocks. 
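+
+To make the rewind arithmetic concrete, here is a small numeric sketch using figures from the companion restart-fork-analysis doc (recalculated safe head L1 origin 97, `ChannelTimeout = 50`). The loop mirrors the walk-back condition in `initialReset`, simplified to step one L1 origin at a time instead of one L2 block:
+
+```bash
+SAFE_L1_ORIGIN=97
+CHANNEL_TIMEOUT=50
+
+# Keep walking back while pipelineL2.L1Origin + ChannelTimeout > safeHead.L1Origin.
+ORIGIN=$SAFE_L1_ORIGIN
+while [ $((ORIGIN + CHANNEL_TIMEOUT)) -gt "$SAFE_L1_ORIGIN" ]; do
+  ORIGIN=$((ORIGIN - 1))
+done
+echo "L1 traversal restarts from origin $ORIGIN"  # prints 47
+```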
+ +--- + +## Phase 5: RPC Node Startup - Fork Detection + +### What Happens + +When `op-rpc` starts after the sequencer has recovered: +- RPC node quickly syncs by deriving blocks from L1 +- At some point, the RPC node and sequencer may derive different blocks at the same height +- This causes a fork between the two nodes + +### Potential Causes of Fork + +1. **Different L1 Traversal Origins**: The RPC node's `initialReset` may determine a different L1 traversal origin than the sequencer +2. **Different Safe Heads at Startup**: If the RPC node starts with a different safe head, it may derive different empty batches +3. **Timing Differences**: The RPC node may see different L1 blocks or batches at different times + +### Code Analysis: L1 Traversal Origin Determination + +The L1 traversal origin is determined by `initialReset`, which walks back considering `ChannelTimeout`: + +```244:261:op-node/rollup/derive/pipeline.go + for { + afterL2Genesis := pipelineL2.Number > dp.rollupCfg.Genesis.L2.Number + afterL1Genesis := pipelineL2.L1Origin.Number > dp.rollupCfg.Genesis.L1.Number + afterChannelTimeout := pipelineL2.L1Origin.Number+spec.ChannelTimeout(pipelineOrigin.Time) > l1Origin.Number + if afterL2Genesis && afterL1Genesis && afterChannelTimeout { + parent, err := dp.l2.L2BlockRefByHash(ctx, pipelineL2.ParentHash) + if err != nil { + return NewResetError(fmt.Errorf("failed to fetch L2 parent block %s", pipelineL2.L2BlockID())) + } + pipelineL2 = parent + pipelineOrigin, err = dp.l1Fetcher.L1BlockRefByHash(ctx, pipelineL2.L1Origin.Hash) + if err != nil { + return NewTemporaryError(fmt.Errorf("failed to fetch the new L1 progress: origin: %s; err: %w", pipelineL2.L1Origin, err)) + } + } else { + break + } + } +``` + +**Why forks can occur:** +- If the RPC node's safe head at startup differs from the sequencer's safe head, `initialReset` may walk back to a different L1 origin +- This different origin leads to different `bs.l1Blocks[0]` values in `deriveNextEmptyBatch` +- Different epochs result in different empty batch generation, causing a fork + +### Code Analysis: Empty Batch Epoch Selection + +Empty batches use `bs.l1Blocks[0]` as the epoch: + +```162:189:op-node/rollup/derive/base_batch_stage.go +// deriveNextEmptyBatch may derive an empty batch if the sequencing window is expired +func (bs *baseBatchStage) deriveNextEmptyBatch(ctx context.Context, outOfData bool, parent eth.L2BlockRef) (*SingularBatch, error) { + epoch := bs.l1Blocks[0] + // If the current epoch is too old compared to the L1 block we are at, + // i.e. if the sequence window expired, we create empty batches for the current epoch + expiryEpoch := epoch.Number + bs.config.SeqWindowSize + forceEmptyBatches := (expiryEpoch == bs.origin.Number && outOfData) || expiryEpoch < bs.origin.Number + firstOfEpoch := epoch.Number == parent.L1Origin.Number+1 + nextTimestamp := parent.Time + bs.config.BlockTime + + bs.log.Trace("Potentially generating an empty batch", + "expiryEpoch", expiryEpoch, "forceEmptyBatches", forceEmptyBatches, "nextTimestamp", nextTimestamp, + "epoch_time", epoch.Time, "len_l1_blocks", len(bs.l1Blocks), "firstOfEpoch", firstOfEpoch) + + if !forceEmptyBatches { + // sequence window did not expire yet, still room to receive batches for the current epoch, + // no need to force-create empty batch(es) towards the next epoch yet. 
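+ // (Annotation added for this doc: io.EOF is the pipeline's "no data right now" sentinel, not an error; the caller treats it as out-of-data and advances the L1 traversal.)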
+ return nil, io.EOF + } + if len(bs.l1Blocks) < 2 { + // need next L1 block to proceed towards + return nil, io.EOF + } + + nextEpoch := bs.l1Blocks[1] + // Fill with empty L2 blocks of the same epoch until we meet the time of the next L1 origin, + // to preserve that L2 time >= L1 time. If this is the first block of the epoch, always generate a + // batch to ensure that we at least have one batch per epoch. + if nextTimestamp < nextEpoch.Time || firstOfEpoch { + bs.log.Info("Generating next batch", "epoch", epoch, "timestamp", nextTimestamp, "parent", parent) + return &SingularBatch{ + ParentHash: parent.Hash, + EpochNum: rollup.Epoch(epoch.Number), + EpochHash: epoch.Hash, + Timestamp: nextTimestamp, + Transactions: nil, + }, nil + } +``` + +**Fork Mechanism:** +- If `bs.l1Blocks[0]` differs between sequencer and RPC, they will generate empty batches with different `EpochNum` and `EpochHash` +- This causes blocks at the same L2 height to have different hashes, resulting in a fork + +### Test Script Logic + +The test checks for forks at the safe height: + +```143:159:test/4-op-start-service.sh +# Wait for op-rpc to be ready +echo "⏳ Waiting for op-rpc to be ready..." +while true; do + SAFE_8124=$(cast bn -r http://localhost:8124 safe 2>/dev/null || echo "0") + if [ "$SAFE_8124" != "0" ]; then + # Check for fork at safe height + if ! $SCRIPTS_DIR/check-fork.sh "$SAFE_8124" 2>/dev/null; then + echo "❌ Fork detected at safe height $SAFE_8124, breaking loop" + break + fi + fi + echo " Waiting for op-rpc to be ready..." + sleep 5 +done + +$SCRIPTS_DIR/find-fork.sh +``` + +The `find-fork.sh` script performs a binary search to locate the exact fork point. + +--- + +## Summary + +This test demonstrates a critical recovery scenario in Optimism: + +1. **Sequencer Window Expiry**: When the batcher is down, the sequencer window expires, and safe head advances via empty batch derivation +2. **Batch Rejection Cascade**: When the batcher resumes, batches are rejected due to sequence window expiry and future batch checks +3. **Recovery via Reset**: Restarting the sequencer triggers a reset that finds valid batches from earlier L1 blocks +4. **Fork Potential**: Different nodes may derive different blocks due to different L1 traversal origins, causing forks + +The recovery mechanism relies on the `initialReset` function walking back the L2 chain to find an appropriate L1 origin to start from, allowing the system to skip invalid batches and eventually find valid ones. + diff --git a/test/example.env b/test/example.env index 974dff554267f..5be65b1964a1f 100644 --- a/test/example.env +++ b/test/example.env @@ -80,7 +80,7 @@ NEW_BLOCK_HASH=0xddb9bdc86631494bab4b4749c4575035e2383da7c96d32d31341de862b1dd6c DB_ENGINE="pebble" -CONDUCTOR_ENABLED=true +CONDUCTOR_ENABLED=false LAUNCH_RPC_NODE=true OWNER_TYPE=transactor # safe diff --git a/test/scripts/check-block-txs.sh b/test/scripts/check-block-txs.sh new file mode 100755 index 0000000000000..0507389681bbb --- /dev/null +++ b/test/scripts/check-block-txs.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Script: Check transaction count for each block in specified range +# If transaction count > 1, print block number + +START_HEIGHT=8593921 +RPC_URL="http://localhost:8123" + +# Get latest safe height +echo "Getting latest safe height..." +LATEST_SAFE=$(cast block-number --rpc-url "$RPC_URL" safe 2>/dev/null) + +if [ $? 
+
+### Test Script Logic
+
+The test checks for a fork at the safe height:
+
+```143:159:test/4-op-start-service.sh
+# Wait for op-rpc to be ready
+echo "⏳ Waiting for op-rpc to be ready..."
+while true; do
+    SAFE_8124=$(cast bn -r http://localhost:8124 safe 2>/dev/null || echo "0")
+    if [ "$SAFE_8124" != "0" ]; then
+        # Check for fork at safe height
+        if ! $SCRIPTS_DIR/check-fork.sh "$SAFE_8124" 2>/dev/null; then
+            echo "❌ Fork detected at safe height $SAFE_8124, breaking loop"
+            break
+        fi
+    fi
+    echo "  Waiting for op-rpc to be ready..."
+    sleep 5
+done
+
+$SCRIPTS_DIR/find-fork.sh
+```
+
+Note that `check-fork.sh` (shown below) exits non-zero both on a fork (1) and on an RPC error (2), so the loop also breaks on errors. The `find-fork.sh` script then performs a binary search to locate the exact fork point.
+
+---
+
+## Summary
+
+This test demonstrates a critical recovery scenario in Optimism:
+
+1. **Sequencer Window Expiry**: While the batcher is down, the sequencing window expires and the safe head advances via empty-batch derivation
+2. **Batch Rejection Cascade**: When the batcher resumes, its batches are rejected by the sequencing-window-expiry and future-batch checks
+3. **Recovery via Reset**: Restarting the sequencer triggers a reset that finds valid batches in earlier L1 blocks
+4. **Fork Potential**: Different nodes may derive different blocks from different L1 traversal origins, causing a fork
+
+The recovery mechanism relies on `initialReset` walking the L2 chain back to an appropriate L1 origin to derive from, which lets the node skip invalid batches and eventually find valid ones.
+
diff --git a/test/example.env b/test/example.env
index 974dff554267f..5be65b1964a1f 100644
--- a/test/example.env
+++ b/test/example.env
@@ -80,7 +80,7 @@ NEW_BLOCK_HASH=0xddb9bdc86631494bab4b4749c4575035e2383da7c96d32d31341de862b1dd6c
 
 DB_ENGINE="pebble"
 
-CONDUCTOR_ENABLED=true
+CONDUCTOR_ENABLED=false
 LAUNCH_RPC_NODE=true
 OWNER_TYPE=transactor # safe
diff --git a/test/scripts/check-block-txs.sh b/test/scripts/check-block-txs.sh
new file mode 100755
index 0000000000000..0507389681bbb
--- /dev/null
+++ b/test/scripts/check-block-txs.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Script: Check the transaction count of each block in the specified range
+# and print the block number whenever the count is > 1
+
+START_HEIGHT=8593921
+RPC_URL="http://localhost:8123"
+
+# Get latest safe height
+echo "Getting latest safe height..."
+LATEST_SAFE=$(cast block-number --rpc-url "$RPC_URL" safe 2>/dev/null)
+
+if [ $? -ne 0 ] || [ -z "$LATEST_SAFE" ]; then
+    echo "Error: Failed to get safe height"
+    exit 1
+fi
+
+echo "Start height: $START_HEIGHT"
+echo "Latest safe height: $LATEST_SAFE"
+echo "Starting block check..."
+echo ""
+
+# Check transaction count for each block
+current=$START_HEIGHT
+count=0
+
+while [ $current -le $LATEST_SAFE ]; do
+    # Get transaction count for the block
+    tx_count=$(cast block "$current" --rpc-url "$RPC_URL" --json 2>/dev/null | jq -r '.transactions | length' 2>/dev/null)
+
+    if [ -z "$tx_count" ] || [ "$tx_count" = "null" ]; then
+        echo "Warning: Failed to get transaction count for block $current, skipping"
+        ((current++))
+        continue
+    fi
+
+    # If transaction count > 1, print block number
+    if [ "$tx_count" -gt 1 ]; then
+        echo "Block $current: $tx_count transactions"
+        ((count++))
+    fi
+
+    # Show progress every 1000 blocks
+    if [ $((current % 1000)) -eq 0 ]; then
+        echo "Progress: $current / $LATEST_SAFE (found $count blocks)"
+    fi
+
+    ((current++))
+done
+
+echo ""
+echo "Check completed!"
+echo "Found $count blocks with transaction count > 1"
+
diff --git a/test/scripts/check-fork.sh b/test/scripts/check-fork.sh
new file mode 100755
index 0000000000000..06b1b796f5789
--- /dev/null
+++ b/test/scripts/check-fork.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Check if there's a fork between two RPC endpoints
+# Usage: check-fork.sh [rpc1_url] [rpc2_url]
+#   rpc1_url: first RPC endpoint (default: http://localhost:8123)
+#   rpc2_url: second RPC endpoint (default: http://localhost:8124)
+# Returns: 0 if no fork, 1 if fork detected, 2 on error
+
+RPC1=${1:-http://localhost:8123}
+RPC2=${2:-http://localhost:8124}
+
+# Get latest block heights from both RPCs
+HEIGHT1=$(cast bn -r $RPC1 2>/dev/null)
+HEIGHT2=$(cast bn -r $RPC2 2>/dev/null)
+
+if [ -z "$HEIGHT1" ] || [ "$HEIGHT1" = "0" ] || [ -z "$HEIGHT2" ] || [ "$HEIGHT2" = "0" ]; then
+    exit 2
+fi
+
+# Compare at the smaller of the two heights, which both nodes have reached
+if [ "$HEIGHT1" -le "$HEIGHT2" ]; then
+    HEIGHT=$HEIGHT1
+else
+    HEIGHT=$HEIGHT2
+fi
+
+# Get block hashes at the common height
+HASH1=$(cast block $HEIGHT -r $RPC1 --json 2>/dev/null | jq -r '.hash' 2>/dev/null)
+HASH2=$(cast block $HEIGHT -r $RPC2 --json 2>/dev/null | jq -r '.hash' 2>/dev/null)
+
+if [ -z "$HASH1" ] || [ -z "$HASH2" ] || [ "$HASH1" = "null" ] || [ "$HASH2" = "null" ]; then
+    exit 2
+fi
+
+if [ "$HASH1" = "$HASH2" ]; then
+    echo "✅ No fork detected at height $HEIGHT"
+    exit 0
+else
+    echo "❌ Fork detected at height $HEIGHT"
+    echo "  RPC1 ($RPC1): $HASH1"
+    echo "  RPC2 ($RPC2): $HASH2"
+    exit 1
+fi
diff --git a/test/scripts/control-sequencer.sh b/test/scripts/control-sequencer.sh
new file mode 100755
index 0000000000000..bdf4c2cc4d48b
--- /dev/null
+++ b/test/scripts/control-sequencer.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+# Script: Control op-node sequencer start and stop
+# Usage: ./control-sequencer.sh [stop|start|status] [OP_NODE_RPC] [BLOCK_HASH]
+
+OP_NODE_RPC=${2:-"http://localhost:9545"}
+ACTION=${1:-"status"}
+
+# Check if cast is installed
+if ! command -v cast &> /dev/null; then
+    echo "Error: cast (Foundry) is required"
+    echo "Installation: https://book.getfoundry.sh/getting-started/installation"
+    exit 1
+fi
+
+echo "=== Control op-node Sequencer ==="
+echo "OP-Node RPC: $OP_NODE_RPC"
+echo ""
+
+case "$ACTION" in
+    stop)
+        echo "Stopping sequencer..."
+        result=$(cast rpc admin_stopSequencer --rpc-url "$OP_NODE_RPC" 2>/dev/null)
+
+        if [ $? -eq 0 ] && [ -n "$result" ]; then
+            echo "Sequencer stopped"
+            echo "Latest block hash: $result"
+        else
+            echo "Error: Failed to stop sequencer or sequencer is already stopped"
+            exit 1
+        fi
+        ;;
+
+    start)
+        # Check current status first
+        echo "Checking sequencer current status..."
+        is_active=$(cast rpc admin_sequencerActive --rpc-url "$OP_NODE_RPC" 2>/dev/null)
+
+        if [ $? -ne 0 ]; then
+            echo "Error: Failed to connect to RPC"
+            exit 1
+        fi
+
+        if [ "$is_active" = "true" ]; then
+            echo "Sequencer is already running, no need to start"
+            exit 0
+        fi
+
+        echo "Sequencer is currently stopped, preparing to start..."
+
+        sync_status=$(cast rpc optimism_syncStatus --rpc-url "$OP_NODE_RPC" 2>/dev/null)
+        if [ $? -eq 0 ]; then
+            # Prefer an explicitly passed block hash ($3), else fall back to the unsafe_l2 hash
+            BLOCK_HASH=${3:-$(echo "$sync_status" | jq -r '.unsafe_l2.hash' 2>/dev/null)}
+            if [ -n "$BLOCK_HASH" ] && [ "$BLOCK_HASH" != "null" ]; then
+                echo "Using block hash: $BLOCK_HASH"
+            fi
+        fi
+
+        if [ -z "$BLOCK_HASH" ] || [ "$BLOCK_HASH" = "null" ]; then echo "Error: no block hash available to start from"; exit 1; fi
+        echo "Starting sequencer..."
+
+        result=$(cast rpc admin_startSequencer "$BLOCK_HASH" --rpc-url "$OP_NODE_RPC" 2>&1)
+
+        if [ $? -eq 0 ]; then
+            echo "Sequencer started successfully"
+        else
+            echo "Error: Failed to start sequencer"
+            echo "$result"
+            echo ""
+        fi
+        ;;
+
+    status)
+        echo "Querying sequencer status..."
+        is_active=$(cast rpc admin_sequencerActive --rpc-url "$OP_NODE_RPC" 2>/dev/null)
+
+        if [ $? -eq 0 ]; then
+            if [ "$is_active" = "true" ]; then
+                echo "Sequencer status: Running"
+            else
+                echo "Sequencer status: Stopped"
+            fi
+        else
+            echo "Error: Failed to connect to RPC"
+            exit 1
+        fi
+        ;;
+
+    *)
+        echo "Usage: $0 [stop|start|status] [OP_NODE_RPC] [BLOCK_HASH]"
+        echo ""
+        echo "Examples:"
+        echo "  $0 stop                                        # Stop sequencer"
+        echo "  $0 start                                       # Start sequencer (auto-detect latest unsafe block hash)"
+        echo "  $0 start http://localhost:9545                 # Start sequencer via a specific op-node RPC"
+        echo "  $0 start http://localhost:9545 0x<block_hash>  # Start sequencer from an explicit block hash"
+        echo "  $0 status                                      # Query sequencer status"
+        echo ""
+        echo "Default OP-Node RPC URL: http://localhost:9545"
+        echo "Default block hash: unsafe_l2 hash from optimism_syncStatus"
+        exit 1
+        ;;
+esac
+
diff --git a/test/scripts/find-fork.sh b/test/scripts/find-fork.sh
new file mode 100755
index 0000000000000..6df7d2beb704e
--- /dev/null
+++ b/test/scripts/find-fork.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+
+# Find the fork point between op-rpc and op-seq
+# using binary search
+
+RPC1="http://localhost:8123"
+RPC2="http://localhost:8124"
+START_HEIGHT=8593921
+
+# Get safe height from both nodes
+SAFE1=$(cast bn -r $RPC1 safe)
+SAFE2=$(cast bn -r $RPC2 safe)
+MIN_SAFE=$((SAFE1 < SAFE2 ? SAFE1 : SAFE2))
+
+echo "=== Finding Fork Point ==="
+echo "RPC1 safe height: $SAFE1"
+echo "RPC2 safe height: $SAFE2"
+echo "Min safe height: $MIN_SAFE"
+echo "Start height: $START_HEIGHT"
+echo ""
+
+# Compare block hashes at the min safe height (the highest height that is safe on both nodes)
+MIN_SAFE_HASH1=$(cast block $MIN_SAFE -r $RPC1 --json 2>/dev/null | jq -r '.hash' 2>/dev/null)
+MIN_SAFE_HASH2=$(cast block $MIN_SAFE -r $RPC2 --json 2>/dev/null | jq -r '.hash' 2>/dev/null)
+
+if [ -z "$MIN_SAFE_HASH1" ] || [ -z "$MIN_SAFE_HASH2" ]; then
+    echo "  Error: Failed to get min safe block hash"
+    exit 1
+fi
+
+if [ "$MIN_SAFE_HASH1" = "$MIN_SAFE_HASH2" ]; then
+    echo "  ✅ No fork at safe height, both nodes have the same safe block hash"
+    echo "  Safe block hash at height $MIN_SAFE: $MIN_SAFE_HASH1"
+    echo ""
+    echo "No fork detected at safe height. Exiting."
+    exit 0
+fi
+
+# Binary search. Invariant: the block at fork_height matches on both nodes
+left=$START_HEIGHT
+right=$MIN_SAFE
+fork_height=$START_HEIGHT
+
+while [ $left -le $right ]; do
+    mid=$(((left + right) / 2))
+
+    # Get block hash from both nodes
+    hash1=$(cast block $mid -r $RPC1 --json 2>/dev/null | jq -r '.hash' 2>/dev/null)
+    hash2=$(cast block $mid -r $RPC2 --json 2>/dev/null | jq -r '.hash' 2>/dev/null)
+
+    if [ -z "$hash1" ] || [ -z "$hash2" ]; then
+        echo "  Error: Failed to get hash for block $mid"
+        break
+    fi
+
+    if [ "$hash1" = "$hash2" ]; then
+        fork_height=$mid
+        left=$((mid + 1))
+    else
+        right=$((mid - 1))
+    fi
+done
+
+# Verify fork point
+echo ""
+echo "=== Verifying Fork Point ==="
+echo "Block $fork_height (last matching):"
+hash1=$(cast block $fork_height -r $RPC1 --json 2>/dev/null | jq -r '.hash')
+hash2=$(cast block $fork_height -r $RPC2 --json 2>/dev/null | jq -r '.hash')
+echo "  RPC1: $hash1"
+echo "  RPC2: $hash2"
+if [ "$hash1" = "$hash2" ]; then
+    echo "  ✅ Match"
+else
+    echo "  ❌ Mismatch"
+fi
+
+fork_block=$((fork_height + 1))
+if [ $fork_block -le $MIN_SAFE ]; then
+    echo ""
+    echo "Block $fork_block (first fork):"
+    hash1=$(cast block $fork_block -r $RPC1 --json 2>/dev/null | jq -r '.hash')
+    hash2=$(cast block $fork_block -r $RPC2 --json 2>/dev/null | jq -r '.hash')
+    echo "  RPC1: $hash1"
+    echo "  RPC2: $hash2"
+    if [ "$hash1" = "$hash2" ]; then
+        echo "  ✅ Still matching"
+    else
+        echo "  ❌ Fork confirmed"
+    fi
+fi
+
diff --git a/test/scripts/reset-rpc.sh b/test/scripts/reset-rpc.sh
new file mode 100755
index 0000000000000..b5971868cdc90
--- /dev/null
+++ b/test/scripts/reset-rpc.sh
@@ -0,0 +1,8 @@
+docker compose down op-rpc
+docker compose down op-geth-rpc
+
+cd data
+
+rm -rf op-rpc
+rm -rf op-geth-rpc
+cp -r op-geth-rpc-bak op-geth-rpc
diff --git a/test/scripts/restart-op-rpc-on-target-height.sh b/test/scripts/restart-op-rpc-on-target-height.sh
new file mode 100755
index 0000000000000..3d12a48ef44d1
--- /dev/null
+++ b/test/scripts/restart-op-rpc-on-target-height.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Restart op-rpc once its unsafe head reaches the target height
+# Usage: restart-op-rpc-on-target-height.sh
+# Loops until the unsafe head reaches the target height, then restarts op-rpc
+
+TARGET_UNSAFE_HEIGHT=8594289
+PRE=$((TARGET_UNSAFE_HEIGHT - 1000))
+RPC_URL="http://localhost:8124"
+
+echo "⏳ Waiting for unsafe head to reach height $TARGET_UNSAFE_HEIGHT..."
+
+echo "Fast-poll threshold (no sleep above this height): $PRE"
+
+# Loop until condition is met
+while true; do
+    # Get current unsafe height
+    CURRENT_UNSAFE_HEIGHT=$(cast bn -r $RPC_URL 2>/dev/null)
+    if [ -z "$CURRENT_UNSAFE_HEIGHT" ] || [ "$CURRENT_UNSAFE_HEIGHT" = "0" ]; then
+        echo "  Waiting for RPC to be ready... (current: unavailable)"
+        sleep 1
+        continue
+    fi
+
+    # Check if unsafe height reached the target
+    if [ "$CURRENT_UNSAFE_HEIGHT" -ge "$TARGET_UNSAFE_HEIGHT" ]; then
+        echo "✅ Unsafe head reached target height: $CURRENT_UNSAFE_HEIGHT"
+        docker compose restart op-rpc
+        exit 0
+    fi
+
+    echo "  Current unsafe height: $CURRENT_UNSAFE_HEIGHT (target: >= $TARGET_UNSAFE_HEIGHT)"
+
+    if [ "$CURRENT_UNSAFE_HEIGHT" -lt "$PRE" ]; then # sleep only while far below the target; poll continuously once close
+        sleep 1
+    fi
+done
+
diff --git a/test/scripts/send-and-record.sh b/test/scripts/send-and-record.sh
new file mode 100755
index 0000000000000..5f1bd870fea82
--- /dev/null
+++ b/test/scripts/send-and-record.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Script: Continuously send transactions and record their hashes to hash.txt
+
+HASH_FILE="hash.txt"
+
+rm -f "$HASH_FILE"
+
+# Ensure hash.txt exists
+touch "$HASH_FILE"
+
+echo "Starting to send transactions and record hashes..."
+echo "Press Ctrl+C to stop"
+
+while true; do
+    # Send a transaction and capture its hash
+    HASH=$(cast send 0xf39Fd6e51aad88F6F4ce6aB8827279cffFb92266 \
+        --private-key=0x4bbbf85ce3377467afe5d46f804f221813b2bb87f24d81f60f1fcdbf7cbf4356 \
+        --value=1 \
+        -r http://localhost:8123 \
+        --json 2>/dev/null | jq -r .transactionHash)
+
+    # Check if the hash was successfully obtained
+    if [ -n "$HASH" ] && [ "$HASH" != "null" ]; then
+        echo "$HASH" >> "$HASH_FILE"
+        echo "Recorded hash: $HASH"
+    else
+        echo "Error: Failed to get transaction hash"
+    fi
+
+    # Short delay to avoid sending too fast
+    sleep 1
+done
+