diff --git a/go-runner/overlay/benchmark1.24.0.go b/go-runner/overlay/benchmark1.24.0.go index fd4c0fa..b2a0eeb 100644 --- a/go-runner/overlay/benchmark1.24.0.go +++ b/go-runner/overlay/benchmark1.24.0.go @@ -356,45 +356,7 @@ func (b *B) launch() { b.runN(b.benchTime.n) } } else { - warmupD := b.benchTime.d / 10 - warmupN := int64(1) - for n := int64(1); !b.failed && b.duration < warmupD && n < 1e9; { - last := n - // Predict required iterations. - goalns := warmupD.Nanoseconds() - prevIters := int64(b.N) - n = int64(predictN(goalns, prevIters, b.duration.Nanoseconds(), last)) - b.runN(int(n)) - warmupN = n - } - - // Reset the fields from the warmup run - b.ResetTimer() - - // Final run: - benchD := b.benchTime.d - benchN := predictN(benchD.Nanoseconds(), int64(b.N), b.duration.Nanoseconds(), warmupN) - - // When we have a very slow benchmark (e.g. taking 500ms), we have to: - // 1. Reduce the number of rounds to not slow down the process (e.g. by executing a 1s bench 100 times) - // 2. Not end up with roundN of 0 when dividing benchN (which can be < 100) by rounds - const minRounds = 100 - var rounds int - var roundN int - if benchN < minRounds { - rounds = benchN - roundN = 1 - } else { - rounds = minRounds - roundN = benchN / int(rounds) - } - - b.codspeed.instrument_hooks.StartBenchmark() - for range rounds { - b.runN(int(roundN)) - } - b.codspeed.instrument_hooks.StopBenchmark() - b.sendAccumulatedTimestamps() + runBenchmarkWithWarmup(b) } } b.result = BenchmarkResult{b.N, b.duration, b.bytes, b.netAllocs, b.netBytes, b.codspeedTimePerRoundNs, b.codspeedItersPerRound, b.extra} @@ -468,6 +430,7 @@ func (b *B) loopSlowPath() bool { } // Within a b.Loop loop, we don't use b.N (to avoid confusion). b.N = 0 + b.loopStartTime = time.Now() b.codspeed.instrument_hooks.StartBenchmark() b.ResetTimer() b.StartTimerWithoutMarker() @@ -489,7 +452,7 @@ func (b *B) loopSlowPath() bool { more = false } else { // Handle fixed time case - more = b.stopOrScaleBLoop() + more = b.stopOrScaleBLoopCodspeed() } if !more { // NOTE: We could move the endTimestamp capturing further up or even into the Loop() function diff --git a/go-runner/overlay/benchmark1.24.0.patch b/go-runner/overlay/benchmark1.24.0.patch index d24fcf3..aed1c17 100644 --- a/go-runner/overlay/benchmark1.24.0.patch +++ b/go-runner/overlay/benchmark1.24.0.patch @@ -1,5 +1,5 @@ ---- benchmark.go.1.24 2026-01-09 11:59:54.625882898 +0100 -+++ overlay/benchmark1.24.go 2026-01-09 12:30:27.328634216 +0100 +--- benchmark1.24.0.go 2026-01-16 16:35:37.898143679 +0100 ++++ overlay/benchmark1.24.0.go 2026-01-16 16:30:04.538059278 +0100 @@ -93,6 +93,7 @@ // affecting benchmark results. type B struct { @@ -8,8 +8,30 @@ importPath string // import path of the package containing the benchmark bstate *benchState N int -@@ -132,31 +133,24 @@ - } +@@ -114,34 +115,42 @@ + netBytes uint64 + // Extra metrics collected by ReportMetric. + extra map[string]float64 +- // For Loop() to be executed in benchFunc. +- // Loop() has its own control logic that skips the loop scaling. +- // See issue #61515. +- loopN int ++ ++ // loop tracks the state of B.Loop ++ loop struct { ++ // n is the target number of iterations. It gets bumped up as we go. ++ // When the benchmark loop is done, we commit this to b.N so users can ++ // do reporting based on it, but we avoid exposing it until then. ++ n uint64 ++ // i is the current Loop iteration. It's strictly monotonically ++ // increasing toward n. 
++ // ++ // The high bit is used to poison the Loop fast path and fall back to ++ // the slow path. ++ i uint64 ++ ++ done bool // set when B.Loop return false ++ } } -// StartTimer starts timing a test. This function is called automatically @@ -22,7 +44,6 @@ - b.startBytes = memStats.TotalAlloc - b.start = highPrecisionTimeNow() - b.timerOn = true -- b.loop.i &^= loopPoisonTimer + timerOn := b.timerOn + + b.StartTimerWithoutMarker() @@ -41,8 +62,6 @@ - b.netAllocs += memStats.Mallocs - b.startAllocs - b.netBytes += memStats.TotalAlloc - b.startBytes - b.timerOn = false -- // If we hit B.Loop with the timer stopped, fail. -- b.loop.i |= loopPoisonTimer + endTimestamp := CurrentTimestamp() + timerOn := b.timerOn + @@ -53,7 +72,7 @@ } } -@@ -176,10 +170,18 @@ +@@ -161,10 +170,18 @@ b.startAllocs = memStats.Mallocs b.startBytes = memStats.TotalAlloc b.start = highPrecisionTimeNow() @@ -72,7 +91,7 @@ } // SetBytes records the number of bytes processed in a single operation. -@@ -195,6 +197,11 @@ +@@ -180,6 +197,11 @@ // runN runs a single benchmark for the specified number of iterations. func (b *B) runN(n int) { @@ -84,15 +103,32 @@ benchmarkLock.Lock() defer benchmarkLock.Unlock() ctx, cancelCtx := context.WithCancel(context.Background()) -@@ -218,6 +225,7 @@ +@@ -192,7 +214,9 @@ + runtime.GC() + b.resetRaces() + b.N = n +- b.loopN = 0 ++ b.loop.n = 0 ++ b.loop.i = 0 ++ b.loop.done = false + b.ctx = ctx + b.cancelCtx = cancelCtx + +@@ -201,8 +225,13 @@ b.StartTimer() b.benchFunc(b) b.StopTimer() + b.SaveMeasurement() b.previousN = n b.previousDuration = b.duration ++ ++ if b.loop.n > 0 && !b.loop.done && !b.failed { ++ b.Error("benchmark function returned without B.Loop() == false (break or return in loop?)") ++ } + } -@@ -246,6 +254,8 @@ + // run1 runs the first iteration of benchFunc. It reports whether more +@@ -225,6 +254,8 @@ }() <-b.signal if b.failed { @@ -101,7 +137,7 @@ fmt.Fprintf(b.w, "%s--- FAIL: %s\n%s", b.chatty.prefix(), b.name, b.output) return false } -@@ -274,6 +284,8 @@ +@@ -253,6 +284,8 @@ // subbenchmarks. b must not have subbenchmarks. func (b *B) run() { labelsOnce.Do(func() { @@ -110,52 +146,31 @@ fmt.Fprintf(b.w, "goos: %s\n", runtime.GOOS) fmt.Fprintf(b.w, "goarch: %s\n", runtime.GOARCH) if b.importPath != "" { -@@ -344,18 +356,48 @@ +@@ -312,8 +345,8 @@ + }() + + // b.Loop does its own ramp-up logic so we just need to run it once. +- // If b.loopN is non zero, it means b.Loop has already run. +- if b.loopN == 0 { ++ // If b.loop.n is non zero, it means b.Loop has already run. ++ if b.loop.n == 0 { + // Run the benchmark for at least the specified amount of time. + if b.benchTime.n > 0 { + // We already ran a single iteration in run1. +@@ -323,18 +356,10 @@ b.runN(b.benchTime.n) } } else { - d := b.benchTime.d - for n := int64(1); !b.failed && b.duration < d && n < 1e9; { -+ warmupD := b.benchTime.d / 10 -+ warmupN := int64(1) -+ for n := int64(1); !b.failed && b.duration < warmupD && n < 1e9; { - last := n - // Predict required iterations. +- last := n +- // Predict required iterations. - goalns := d.Nanoseconds() -+ goalns := warmupD.Nanoseconds() - prevIters := int64(b.N) - n = int64(predictN(goalns, prevIters, b.duration.Nanoseconds(), last)) - b.runN(int(n)) -+ warmupN = n -+ } -+ -+ // Reset the fields from the warmup run -+ b.ResetTimer() -+ -+ // Final run: -+ benchD := b.benchTime.d -+ benchN := predictN(benchD.Nanoseconds(), int64(b.N), b.duration.Nanoseconds(), warmupN) -+ -+ // When we have a very slow benchmark (e.g. taking 500ms), we have to: -+ // 1. 
Reduce the number of rounds to not slow down the process (e.g. by executing a 1s bench 100 times) -+ // 2. Not end up with roundN of 0 when dividing benchN (which can be < 100) by rounds -+ const minRounds = 100 -+ var rounds int -+ var roundN int -+ if benchN < minRounds { -+ rounds = benchN -+ roundN = 1 -+ } else { -+ rounds = minRounds -+ roundN = benchN / int(rounds) -+ } -+ -+ b.codspeed.instrument_hooks.StartBenchmark() -+ for range rounds { -+ b.runN(int(roundN)) - } -+ b.codspeed.instrument_hooks.StopBenchmark() -+ b.sendAccumulatedTimestamps() +- prevIters := int64(b.N) +- n = int64(predictN(goalns, prevIters, b.duration.Nanoseconds(), last)) +- b.runN(int(n)) +- } ++ runBenchmarkWithWarmup(b) } } - b.result = BenchmarkResult{b.N, b.duration, b.bytes, b.netAllocs, b.netBytes, b.extra} @@ -163,44 +178,50 @@ } // Elapsed returns the measured elapsed time of the benchmark. -@@ -391,11 +433,7 @@ +@@ -368,42 +393,93 @@ + } + func (b *B) stopOrScaleBLoop() bool { - t := b.Elapsed() - if t >= b.benchTime.d { +- timeElapsed := highPrecisionTimeSince(b.start) +- if timeElapsed >= b.benchTime.d { - // Stop the timer so we don't count cleanup time - b.StopTimer() -- // Commit iteration count -- b.N = int(b.loop.n) -- b.loop.done = true ++ t := b.Elapsed() ++ if t >= b.benchTime.d { + // We've reached the target return false } // Loop scaling -@@ -407,45 +445,78 @@ - // in big trouble. - panic("loop iteration target overflow") - } -- b.loop.i++ + goalns := b.benchTime.d.Nanoseconds() +- prevIters := int64(b.N) +- b.N = predictN(goalns, prevIters, timeElapsed.Nanoseconds(), prevIters) +- b.loopN++ ++ prevIters := int64(b.loop.n) ++ b.loop.n = uint64(predictN(goalns, prevIters, t.Nanoseconds(), prevIters)) ++ if b.loop.n&loopPoisonMask != 0 { ++ // The iteration count should never get this high, but if it did we'd be ++ // in big trouble. ++ panic("loop iteration target overflow") ++ } return true } func (b *B) loopSlowPath() bool { - // Consistency checks -- if !b.timerOn { -- b.Fatal("B.Loop called with timer stopped") -- } +- if b.loopN == 0 { +- // If it's the first call to b.Loop() in the benchmark function. +- // Allows more precise measurement of benchmark loop cost counts. +- // Also initialize b.N to 1 to kick start loop scaling. +- b.N = 1 +- b.loopN = 1 ++ // Consistency checks + // if !b.timerOn { + // b.Fatal("B.Loop called with timer stopped") + // } - if b.loop.i&loopPoisonMask != 0 { - panic(fmt.Sprintf("unknown loop stop condition: %#x", b.loop.i)) - } - - if b.loop.n == 0 { -- // If it's the first call to b.Loop() in the benchmark function. -- // Allows more precise measurement of benchmark loop cost counts. -- // Also initialize target to 1 to kick start loop scaling. -- b.loop.n = 1 ++ if b.loop.i&loopPoisonMask != 0 { ++ panic(fmt.Sprintf("unknown loop stop condition: %#x", b.loop.i)) ++ } ++ ++ if b.loop.n == 0 { + // It's the first call to b.Loop() in the benchmark function. + if b.benchTime.n > 0 { + // Fixed iteration count. @@ -209,9 +230,9 @@ + // Initialize target to 1 to kick start loop scaling. + b.loop.n = 1 + } - // Within a b.Loop loop, we don't use b.N (to avoid confusion). - b.N = 0 -- b.loop.i++ ++ // Within a b.Loop loop, we don't use b.N (to avoid confusion). ++ b.N = 0 ++ b.loopStartTime = time.Now() + b.codspeed.instrument_hooks.StartBenchmark() b.ResetTimer() + b.StartTimerWithoutMarker() @@ -225,9 +246,9 @@ + // Should we keep iterating? 
+ var more bool if b.benchTime.n > 0 { -- if b.loop.n < uint64(b.benchTime.n) { -- b.loop.n = uint64(b.benchTime.n) -- b.loop.i++ +- if b.N < b.benchTime.n { +- b.N = b.benchTime.n +- b.loopN++ - return true + // The iteration count is fixed, so we should have run this many and now + // be done. @@ -239,7 +260,7 @@ + more = false + } else { + // Handle fixed time case -+ more = b.stopOrScaleBLoop() ++ more = b.stopOrScaleBLoopCodspeed() + } + if !more { + // NOTE: We could move the endTimestamp capturing further up or even into the Loop() function @@ -257,9 +278,9 @@ + b.codspeed.instrument_hooks.StopBenchmark() + b.sendAccumulatedTimestamps() + - // Commit iteration count - b.N = int(b.loop.n) - b.loop.done = true ++ // Commit iteration count ++ b.N = int(b.loop.n) ++ b.loop.done = true return false } - // Handles fixed time case @@ -272,24 +293,51 @@ } // Loop returns true as long as the benchmark should continue running. -@@ -482,6 +553,8 @@ +@@ -440,13 +516,41 @@ // whereas b.N-based benchmarks must run the benchmark function (and any // associated setup and cleanup) several times. func (b *B) Loop() bool { +- if b.loopN != 0 && b.loopN < b.N { +- b.loopN++ + b.StopTimerWithoutMarker() + b.SaveMeasurement() - // This is written such that the fast path is as fast as possible and can be - // inlined. - // -@@ -496,6 +569,7 @@ - // path can do consistency checks and fail. - if b.loop.i < b.loop.n { - b.loop.i++ ++ // This is written such that the fast path is as fast as possible and can be ++ // inlined. ++ // ++ // There are three cases where we'll fall out of the fast path: ++ // ++ // - On the first call, both i and n are 0. ++ // ++ // - If the loop reaches the n'th iteration, then i == n and we need ++ // to figure out the new target iteration count or if we're done. ++ // ++ // - If the timer is stopped, it poisons the top bit of i so the slow ++ // path can do consistency checks and fail. ++ if b.loop.i < b.loop.n { ++ b.loop.i++ + b.StartTimerWithoutMarker() return true } return b.loopSlowPath() -@@ -522,6 +596,9 @@ + } + ++// The loopPoison constants can be OR'd into B.loop.i to cause it to fall back ++// to the slow path. ++const ( ++ loopPoisonTimer = uint64(1 << (63 - iota)) ++ // If necessary, add more poison bits here. ++ ++ // loopPoisonMask is the set of all loop poison bits. (iota-1) is the index ++ // of the bit we just set, from which we recreate that bit mask. We subtract ++ // 1 to set all of the bits below that bit, then complement the result to ++ // get the mask. Sorry, not sorry. ++ loopPoisonMask = ^uint64((1 << (63 - (iota - 1))) - 1) ++) ++ + // BenchmarkResult contains the results of a benchmark run. + type BenchmarkResult struct { + N int // The number of iterations. +@@ -455,6 +559,9 @@ MemAllocs uint64 // The total number of memory allocations. MemBytes uint64 // The total number of bytes allocated. @@ -299,7 +347,7 @@ // Extra records additional metrics reported by ReportMetric. 
Extra map[string]float64 } -@@ -702,6 +779,9 @@ +@@ -635,6 +742,9 @@ w: os.Stdout, bench: true, }, @@ -309,7 +357,7 @@ importPath: importPath, benchFunc: func(b *B) { for _, Benchmark := range bs { -@@ -711,6 +791,8 @@ +@@ -644,6 +754,8 @@ benchTime: benchTime, bstate: bstate, } @@ -318,7 +366,7 @@ if Verbose() { main.chatty = newChattyPrinter(main.w) } -@@ -739,6 +821,7 @@ +@@ -672,6 +784,7 @@ chatty: b.chatty, bench: true, }, @@ -326,7 +374,7 @@ benchFunc: b.benchFunc, benchTime: b.benchTime, } -@@ -746,6 +829,8 @@ +@@ -679,6 +792,8 @@ } r := b.doBench() if b.failed { @@ -335,7 +383,7 @@ // The output could be very long here, but probably isn't. // We print it all, regardless, because we don't want to trim the reason // the benchmark failed. -@@ -753,6 +838,8 @@ +@@ -686,6 +801,8 @@ continue } results := r.String() @@ -344,7 +392,7 @@ if b.chatty != nil { fmt.Fprintf(b.w, "%-*s\t", s.maxLen, benchName) } -@@ -813,6 +900,7 @@ +@@ -746,6 +863,7 @@ chatty: b.chatty, bench: true, }, diff --git a/go-runner/overlay/benchmark1.25.0.go b/go-runner/overlay/benchmark1.25.0.go index 8fcb1d4..8a1ee29 100644 --- a/go-runner/overlay/benchmark1.25.0.go +++ b/go-runner/overlay/benchmark1.25.0.go @@ -356,45 +356,7 @@ func (b *B) launch() { b.runN(b.benchTime.n) } } else { - warmupD := b.benchTime.d / 10 - warmupN := int64(1) - for n := int64(1); !b.failed && b.duration < warmupD && n < 1e9; { - last := n - // Predict required iterations. - goalns := warmupD.Nanoseconds() - prevIters := int64(b.N) - n = int64(predictN(goalns, prevIters, b.duration.Nanoseconds(), last)) - b.runN(int(n)) - warmupN = n - } - - // Reset the fields from the warmup run - b.ResetTimer() - - // Final run: - benchD := b.benchTime.d - benchN := predictN(benchD.Nanoseconds(), int64(b.N), b.duration.Nanoseconds(), warmupN) - - // When we have a very slow benchmark (e.g. taking 500ms), we have to: - // 1. Reduce the number of rounds to not slow down the process (e.g. by executing a 1s bench 100 times) - // 2. Not end up with roundN of 0 when dividing benchN (which can be < 100) by rounds - const minRounds = 100 - var rounds int - var roundN int - if benchN < minRounds { - rounds = benchN - roundN = 1 - } else { - rounds = minRounds - roundN = benchN / int(rounds) - } - - b.codspeed.instrument_hooks.StartBenchmark() - for range rounds { - b.runN(int(roundN)) - } - b.codspeed.instrument_hooks.StopBenchmark() - b.sendAccumulatedTimestamps() + runBenchmarkWithWarmup(b) } } b.result = BenchmarkResult{b.N, b.duration, b.bytes, b.netAllocs, b.netBytes, b.codspeedTimePerRoundNs, b.codspeedItersPerRound, b.extra} @@ -468,6 +430,7 @@ func (b *B) loopSlowPath() bool { } // Within a b.Loop loop, we don't use b.N (to avoid confusion). 
b.N = 0 + b.loopStartTime = time.Now() b.codspeed.instrument_hooks.StartBenchmark() b.ResetTimer() b.StartTimerWithoutMarker() @@ -489,7 +452,7 @@ func (b *B) loopSlowPath() bool { more = false } else { // Handle fixed time case - more = b.stopOrScaleBLoop() + more = b.stopOrScaleBLoopCodspeed() } if !more { // NOTE: We could move the endTimestamp capturing further up or even into the Loop() function diff --git a/go-runner/overlay/benchmark1.25.0.patch b/go-runner/overlay/benchmark1.25.0.patch index 6ed5443..c94d261 100644 --- a/go-runner/overlay/benchmark1.25.0.patch +++ b/go-runner/overlay/benchmark1.25.0.patch @@ -1,5 +1,5 @@ ---- benchmark.go 2026-01-09 11:36:51.153087761 +0100 -+++ overlay/benchmark1.25.go 2026-01-09 11:58:31.662387782 +0100 +--- benchmark1.25.0.go 2026-01-16 16:35:48.032061438 +0100 ++++ overlay/benchmark1.25.0.go 2026-01-16 16:30:04.538439669 +0100 @@ -93,6 +93,7 @@ // affecting benchmark results. type B struct { @@ -110,52 +110,20 @@ fmt.Fprintf(b.w, "goos: %s\n", runtime.GOOS) fmt.Fprintf(b.w, "goarch: %s\n", runtime.GOARCH) if b.importPath != "" { -@@ -344,18 +356,48 @@ +@@ -344,18 +356,10 @@ b.runN(b.benchTime.n) } } else { - d := b.benchTime.d - for n := int64(1); !b.failed && b.duration < d && n < 1e9; { -+ warmupD := b.benchTime.d / 10 -+ warmupN := int64(1) -+ for n := int64(1); !b.failed && b.duration < warmupD && n < 1e9; { - last := n - // Predict required iterations. +- last := n +- // Predict required iterations. - goalns := d.Nanoseconds() -+ goalns := warmupD.Nanoseconds() - prevIters := int64(b.N) - n = int64(predictN(goalns, prevIters, b.duration.Nanoseconds(), last)) - b.runN(int(n)) -+ warmupN = n - } -+ -+ // Reset the fields from the warmup run -+ b.ResetTimer() -+ -+ // Final run: -+ benchD := b.benchTime.d -+ benchN := predictN(benchD.Nanoseconds(), int64(b.N), b.duration.Nanoseconds(), warmupN) -+ -+ // When we have a very slow benchmark (e.g. taking 500ms), we have to: -+ // 1. Reduce the number of rounds to not slow down the process (e.g. by executing a 1s bench 100 times) -+ // 2. Not end up with roundN of 0 when dividing benchN (which can be < 100) by rounds -+ const minRounds = 100 -+ var rounds int -+ var roundN int -+ if benchN < minRounds { -+ rounds = benchN -+ roundN = 1 -+ } else { -+ rounds = minRounds -+ roundN = benchN / int(rounds) -+ } -+ -+ b.codspeed.instrument_hooks.StartBenchmark() -+ for range rounds { -+ b.runN(int(roundN)) -+ } -+ b.codspeed.instrument_hooks.StopBenchmark() -+ b.sendAccumulatedTimestamps() +- prevIters := int64(b.N) +- n = int64(predictN(goalns, prevIters, b.duration.Nanoseconds(), last)) +- b.runN(int(n)) +- } ++ runBenchmarkWithWarmup(b) } } - b.result = BenchmarkResult{b.N, b.duration, b.bytes, b.netAllocs, b.netBytes, b.extra} @@ -163,7 +131,7 @@ } // Elapsed returns the measured elapsed time of the benchmark. -@@ -408,9 +450,9 @@ +@@ -408,9 +412,9 @@ func (b *B) loopSlowPath() bool { // Consistency checks @@ -176,18 +144,23 @@ if b.loop.i&loopPoisonMask != 0 { panic(fmt.Sprintf("unknown loop stop condition: %#x", b.loop.i)) } -@@ -426,7 +468,9 @@ +@@ -426,7 +430,10 @@ } // Within a b.Loop loop, we don't use b.N (to avoid confusion). b.N = 0 ++ b.loopStartTime = time.Now() + b.codspeed.instrument_hooks.StartBenchmark() b.ResetTimer() + b.StartTimerWithoutMarker() // Start the next iteration. 
b.loop.i++ -@@ -448,13 +492,28 @@ - more = b.stopOrScaleBLoop() +@@ -445,16 +452,31 @@ + more = false + } else { + // Handle fixed time case +- more = b.stopOrScaleBLoop() ++ more = b.stopOrScaleBLoopCodspeed() } if !more { - b.StopTimer() @@ -216,7 +189,7 @@ // Start the next iteration. b.loop.i++ return true -@@ -495,6 +554,8 @@ +@@ -495,6 +517,8 @@ // whereas b.N-based benchmarks must run the benchmark function (and any // associated setup and cleanup) several times. func (b *B) Loop() bool { @@ -225,7 +198,7 @@ // This is written such that the fast path is as fast as possible and can be // inlined. // -@@ -509,6 +570,7 @@ +@@ -509,6 +533,7 @@ // path can do consistency checks and fail. if b.loop.i < b.loop.n { b.loop.i++ @@ -233,7 +206,7 @@ return true } return b.loopSlowPath() -@@ -535,6 +597,9 @@ +@@ -535,6 +560,9 @@ MemAllocs uint64 // The total number of memory allocations. MemBytes uint64 // The total number of bytes allocated. @@ -243,7 +216,7 @@ // Extra records additional metrics reported by ReportMetric. Extra map[string]float64 } -@@ -715,6 +780,9 @@ +@@ -715,6 +743,9 @@ w: os.Stdout, bench: true, }, @@ -253,7 +226,7 @@ importPath: importPath, benchFunc: func(b *B) { for _, Benchmark := range bs { -@@ -724,6 +792,8 @@ +@@ -724,6 +755,8 @@ benchTime: benchTime, bstate: bstate, } @@ -262,7 +235,7 @@ if Verbose() { main.chatty = newChattyPrinter(main.w) } -@@ -752,6 +822,7 @@ +@@ -752,6 +785,7 @@ chatty: b.chatty, bench: true, }, @@ -270,7 +243,7 @@ benchFunc: b.benchFunc, benchTime: b.benchTime, } -@@ -760,6 +831,8 @@ +@@ -760,6 +794,8 @@ } r := b.doBench() if b.failed { @@ -279,7 +252,7 @@ // The output could be very long here, but probably isn't. // We print it all, regardless, because we don't want to trim the reason // the benchmark failed. -@@ -767,6 +840,8 @@ +@@ -767,6 +803,8 @@ continue } results := r.String() @@ -288,7 +261,7 @@ if b.chatty != nil { fmt.Fprintf(b.w, "%-*s\t", s.maxLen, benchName) } -@@ -827,6 +902,7 @@ +@@ -827,6 +865,7 @@ chatty: b.chatty, bench: true, }, diff --git a/go-runner/overlay/codspeed.go b/go-runner/overlay/codspeed.go index a0de8b3..5e859e9 100644 --- a/go-runner/overlay/codspeed.go +++ b/go-runner/overlay/codspeed.go @@ -26,6 +26,42 @@ type codspeed struct { // Indicates whether a measurement has been saved already. This aims to prevent saving measurements // twice, because `b.Loop()` saves them internally as well but is also called from runN savedMeasurement bool + + // The start time of the first b.Loop() call. This includes the benchmark execution + // time, including the overhead of start/stop the timer each loop iteration. + loopStartTime time.Time +} + +const BenchMaxTimeMult = 3 + +// Modified version of the `stopOrScaleLoop` function to also take into account the +// overhead of start/stop the timer each loop iteration. +// +// If we have large setups/teardowns within the loop, they won't count as benchmark time +// which could cause the benchmark to run for too long. 
+func (b *B) stopOrScaleBLoopCodspeed() bool { + // The total duration must be at most N times the requested benchtime + actualT := time.Since(b.loopStartTime) + if actualT >= b.benchTime.d*BenchMaxTimeMult { + return false + } + + t := b.Elapsed() + if t >= b.benchTime.d { + // We've reached the target + return false + } + + // Loop scaling + goalns := b.benchTime.d.Nanoseconds() + prevIters := int64(b.loop.n) + b.loop.n = uint64(predictN(goalns, prevIters, actualT.Nanoseconds(), prevIters)) + if b.loop.n&loopPoisonMask != 0 { + // The iteration count should never get this high, but if it did we'd be + // in big trouble. + panic("loop iteration target overflow") + } + return true } func findGitRoot() (string, error) { @@ -268,3 +304,58 @@ func (b *B) StartTimerWithoutMarker() { // b.loop.i &^= loopPoisonTimer } } + +func runBenchmarkWithWarmup(b *B) { + warmupD := b.benchTime.d / 10 + warmupN := int64(1) + for n := int64(1); !b.failed && b.duration < warmupD && n < 1e9; { + last := n + // Predict required iterations. + goalns := warmupD.Nanoseconds() + prevIters := int64(b.N) + n = int64(predictN(goalns, prevIters, b.duration.Nanoseconds(), last)) + + // IMPORTANT: We have to measure the _whole_ execution time, to also take into account the setup/teardown time, which + // can be executed inside the loop. We can't execute 10k runs of 1ms when the setup takes 10ms every time. + start := time.Now() + b.runN(int(n)) + b.duration = time.Since(start) + + warmupN = n + } + + // Reset the fields from the warmup run + b.ResetTimer() + + // Final run: + benchD := b.benchTime.d + benchN := predictN(benchD.Nanoseconds(), int64(b.N), b.duration.Nanoseconds(), warmupN) + + // When we have a very slow benchmark (e.g. taking 500ms), we have to: + // 1. Reduce the number of rounds to not slow down the process (e.g. by executing a 1s bench 100 times) + // 2. Not end up with roundN of 0 when dividing benchN (which can be < 100) by rounds + const minRounds = 100 + var rounds int + var roundN int + if benchN < minRounds { + rounds = benchN + roundN = 1 + } else { + rounds = minRounds + roundN = benchN / int(rounds) + } + + benchStart := time.Now() + b.codspeed.instrument_hooks.StartBenchmark() + for range rounds { + b.runN(int(roundN)) + + // Ensure that we don't spend too much time running the benchmarks, bail if we exceed + // N times the requested benchtime. This is a failsafe, if the N prediction is flawed. + if time.Since(benchStart) > benchD*BenchMaxTimeMult { + break + } + } + b.codspeed.instrument_hooks.StopBenchmark() + b.sendAccumulatedTimestamps() +}
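
Note (not part of the patch): below is a minimal standalone sketch of the rounds/roundN split that runBenchmarkWithWarmup performs in the codspeed.go hunk above, assuming only the logic visible in the diff. The helper name splitRounds and the sample benchN values are made up for illustration; only minRounds and the branch structure mirror the patched code. Slow benchmarks (benchN below minRounds) get one iteration per round so roundN never becomes 0, while faster ones are capped at 100 rounds of benchN/100 iterations each, so the executed total can be slightly below benchN because of integer division.

package main

import "fmt"

const minRounds = 100

// splitRounds mirrors the rounds/roundN computation from runBenchmarkWithWarmup:
// benchmarks predicted to need fewer than minRounds iterations run each
// iteration as its own round (rounds = benchN, roundN = 1); everything else is
// capped at minRounds rounds with benchN/minRounds iterations per round.
func splitRounds(benchN int) (rounds, roundN int) {
	if benchN < minRounds {
		return benchN, 1
	}
	return minRounds, benchN / minRounds
}

func main() {
	// Arbitrary example values of the predicted iteration count benchN.
	for _, benchN := range []int{3, 99, 100, 12345} {
		rounds, roundN := splitRounds(benchN)
		fmt.Printf("benchN=%d -> rounds=%d, roundN=%d (total %d iterations)\n",
			benchN, rounds, roundN, rounds*roundN)
	}
}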