From b41e7a80a8d258407debb07387ec436ba0f40e6e Mon Sep 17 00:00:00 2001
From: Zbigniew Tomanek
Date: Fri, 19 Sep 2025 14:14:25 +0200
Subject: [PATCH] fix: Prevent TokenDecoder overflow causing substr exceptions in text summarization

Both GPU and CPU inference can generate token offsets that overflow
signed 32-bit integers, particularly when using
gliner-multitask-large-v0.5 with Q4 quantization on x86 Linux servers
with ONNX Runtime 1.20.1. This leads to spans with startIdx/endIdx
values around 1.6e9 during text summarization tasks.

When these invalid indices are passed to std::string::substr, it throws
"basic_string::substr: __pos > this->size()" exceptions.

Added bounds checking and safe text extraction:
- adjustSpanToTextBounds() validates span indices against text bounds
- safeCopySpanText() guards substr calls with validation
- Invalid spans are now skipped instead of causing crashes
- Both SpanDecoder and TokenDecoder use the safety functions

Resolves crashes in text summarization tasks and maintains
compatibility with existing regression tests.
---
Review notes (text between the "---" separator and the diff is not part of
the commit):
 * adjustSpanToTextBounds() takes the span by const reference and returns
   bool: it only validates the indices, it never adjusts them.
 * The second bounds check in safeCopySpanText() repeats conditions that
   adjustSpanToTextBounds() has already rejected, so it cannot fire.
 * safeCopySpanText() returns "" for a rejected span and both callers skip
   any span whose text is empty, so a zero-length span is dropped the same
   way as an out-of-range one, and nothing is logged in either case.
 * Line breaks, indentation and the stripped template arguments
   (static_cast<size_t>, std::vector<std::vector<Span>>) were reconstructed
   from a whitespace-collapsed copy; verify against the original commit
   before applying.

 src/decoder.cpp | 57 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 5 deletions(-)

diff --git a/src/decoder.cpp b/src/decoder.cpp
index c4eb81a..4b91ceb 100644
--- a/src/decoder.cpp
+++ b/src/decoder.cpp
@@ -8,6 +8,43 @@ float sigmoid(float x) {
     return 1.0 / (1.0 + std::exp(-x));
 }
 
+static bool adjustSpanToTextBounds(const Span& span, size_t textLength) {
+    // Check for negative indices (overflow to large positive values)
+    if (span.startIdx < 0 || span.endIdx < 0) {
+        return false;
+    }
+
+    // Check that indices don't exceed text length
+    if (static_cast<size_t>(span.startIdx) >= textLength ||
+        static_cast<size_t>(span.endIdx) > textLength) {
+        return false;
+    }
+
+    // Check that startIdx <= endIdx
+    if (span.startIdx > span.endIdx) {
+        return false;
+    }
+
+    return true;
+}
+
+static std::string safeCopySpanText(const Span& span, const std::string& text) {
+    // First validate span bounds
+    if (!adjustSpanToTextBounds(span, text.length())) {
+        return "";
+    }
+
+    // Double-check bounds before substr call
+    size_t startPos = static_cast<size_t>(span.startIdx);
+    size_t endPos = static_cast<size_t>(span.endIdx);
+
+    if (startPos >= text.length() || endPos > text.length() || startPos > endPos) {
+        return "";
+    }
+
+    return text.substr(startPos, endPos - startPos);
+}
+
 bool Decoder::isNested(const Span& s1, const Span& s2) {
     return (s1.startIdx <= s2.startIdx && s2.endIdx <= s1.endIdx) || (s2.startIdx <= s1.startIdx && s1.endIdx <= s2.endIdx);
 }
@@ -109,11 +146,16 @@ std::vector<std::vector<Span>> SpanDecoder::decode(
             Span span;
             span.startIdx = tokens[batch_id][startToken].start;
             span.endIdx = tokens[batch_id][endToken].end;
-            span.text = texts[batch_id].substr(span.startIdx, span.endIdx - span.startIdx);
             span.classLabel = entities[entity];
             span.prob = prob;
 
-            spans[batch_id].push_back(span);
+            // Safely extract span text with bounds checking
+            span.text = safeCopySpanText(span, texts[batch_id]);
+
+            // Skip spans with invalid indices that couldn't extract text
+            if (!span.text.empty()) {
+                spans[batch_id].push_back(span);
+            }
         }
     }
 
@@ -169,13 +211,18 @@ std::vector<std::vector<Span>> TokenDecoder::decode(
             Span span;
             span.startIdx = tokens[batch_id][startToken].start;
             span.endIdx = tokens[batch_id][endToken].end;
-            span.text = texts[batch_id].substr(span.startIdx, span.endIdx - span.startIdx);
             span.classLabel = entities[entity];
             span.prob = score_sum / n;
 
-            spans[batch_id].push_back(span);
+            // Safely extract span text with bounds checking
+            span.text = safeCopySpanText(span, texts[batch_id]);
+
+            // Skip spans with invalid indices that couldn't extract text
+            if (!span.text.empty()) {
+                spans[batch_id].push_back(span);
+            }
         }
     }
 
     return batchGreedySearch(spans, flatNer, multiLabel);
-}
\ No newline at end of file
+}