From b41e7a80a8d258407debb07387ec436ba0f40e6e Mon Sep 17 00:00:00 2001
From: Zbigniew Tomanek <me@zbeegnew.dev>
Date: Fri, 19 Sep 2025 14:14:25 +0200
Subject: [PATCH] fix: Prevent TokenDecoder overflow causing substr exceptions
 in text summarization

Both GPU and CPU inference can generate token offsets that overflow signed 32-bit integers,
particularly when using gliner-multitask-large-v0.5 with Q4 quantization on x86 Linux
servers with ONNX Runtime 1.20.1. This leads to spans with startIdx/endIdx values around
1.6e9 during text summarization tasks. When these invalid indices are passed to
std::string::substr, it throws "basic_string::substr: __pos > this->size()" exceptions.

Added bounds checking and safe text extraction:
- adjustSpanToTextBounds() validates span indices against text bounds (it only checks;
  the span itself is never modified)
- safeCopySpanText() guards substr calls with validation
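- Spans whose extracted text is empty are now skipped instead of causing crashes; this
  covers spans with invalid indices and also zero-length spans (startIdx == endIdx)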
- Both SpanDecoder and TokenDecoder use the safety functions

Resolves crashes in text summarization tasks and maintains compatibility with existing
regression tests.
---
 src/decoder.cpp | 57 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 5 deletions(-)
diff --git a/src/decoder.cpp b/src/decoder.cpp
index c4eb81a..4b91ceb 100644
--- a/src/decoder.cpp
+++ b/src/decoder.cpp
@@ -8,6 +8,43 @@ float sigmoid(float x) {
     return 1.0 / (1.0 + std::exp(-x));
 }
 
+static bool adjustSpanToTextBounds(const Span& span, size_t textLength) {
+    // Validation only (the span is never modified). Reject negative indices before the size_t casts below.
+    if (span.startIdx < 0 || span.endIdx < 0) {
+        return false;
+    }
+
+    // Reject startIdx at or past the end of the text, and endIdx past the end
+    if (static_cast<size_t>(span.startIdx) >= textLength ||
+        static_cast<size_t>(span.endIdx) > textLength) {
+        return false;
+    }
+
+    // Reject inverted spans (startIdx > endIdx); zero-length spans pass this check
+    if (span.startIdx > span.endIdx) {
+        return false;
+    }
+
+    return true;
+}
+
+static std::string safeCopySpanText(const Span& span, const std::string& text) {
+    // Invalid span: return "" (indistinguishable here from the text of a valid zero-length span)
+    if (!adjustSpanToTextBounds(span, text.length())) {
+        return "";
+    }
+
+    // Defensive re-check: repeats the bound conditions adjustSpanToTextBounds() already verified
+    size_t startPos = static_cast<size_t>(span.startIdx);
+    size_t endPos = static_cast<size_t>(span.endIdx);
+
+    if (startPos >= text.length() || endPos > text.length() || startPos > endPos) {
+        return "";
+    }
+
+    return text.substr(startPos, endPos - startPos);
+}
+
 bool Decoder::isNested(const Span& s1, const Span& s2) {
     return (s1.startIdx <= s2.startIdx && s2.endIdx <= s1.endIdx) || (s2.startIdx <= s1.startIdx && s1.endIdx <= s2.endIdx);
 }
@@ -109,11 +146,16 @@ std::vector<std::vector<Span>> SpanDecoder::decode(
             Span span;
             span.startIdx = tokens[batch_id][startToken].start;
             span.endIdx = tokens[batch_id][endToken].end;
-            span.text = texts[batch_id].substr(span.startIdx, span.endIdx - span.startIdx);
             span.classLabel = entities[entity];
             span.prob = prob;
 
-            spans[batch_id].push_back(span);
+            // Safely extract span text with bounds checking
+            span.text = safeCopySpanText(span, texts[batch_id]);
+
+            // Skip spans with invalid indices that couldn't extract text
+            if (!span.text.empty()) {
+                spans[batch_id].push_back(span);
+            }
         }
     }
 
@@ -169,13 +211,18 @@ std::vector<std::vector<Span>> TokenDecoder::decode(
             Span span;
             span.startIdx = tokens[batch_id][startToken].start;
             span.endIdx = tokens[batch_id][endToken].end;
-            span.text = texts[batch_id].substr(span.startIdx, span.endIdx - span.startIdx);
             span.classLabel = entities[entity];
             span.prob = score_sum / n;
 
-            spans[batch_id].push_back(span);
+            // Safely extract span text with bounds checking
+            span.text = safeCopySpanText(span, texts[batch_id]);
+
+            // Skip spans with invalid indices that couldn't extract text
+            if (!span.text.empty()) {
+                spans[batch_id].push_back(span);
+            }
         }
     }
 
     return batchGreedySearch(spans, flatNer, multiLabel);
-}
\ No newline at end of file
+}