From 5f895f90e069832360c8bb78795611e70bbe33c6 Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Fri, 2 May 2025 23:49:37 +0600
Subject: [PATCH 01/12] `feature`: implemented `utf8.char`, `utf8.charpattern`,
 `utf8.codes`, `utf8.len`, `utf8.codepoint` and `utf8.offset`

---
 src/stdlib/mod.rs  |   3 +-
 src/stdlib/utf8.rs | 351 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 353 insertions(+), 1 deletion(-)
 create mode 100644 src/stdlib/utf8.rs

diff --git a/src/stdlib/mod.rs b/src/stdlib/mod.rs
index aa766153..d7695ee4 100644
--- a/src/stdlib/mod.rs
+++ b/src/stdlib/mod.rs
@@ -4,8 +4,9 @@ mod io;
 mod math;
 mod string;
 mod table;
+mod utf8;
 
 pub use self::{
     base::load_base, coroutine::load_coroutine, io::load_io, math::load_math, string::load_string,
-    table::load_table,
+    table::load_table, utf8::load_utf8,
 };
diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
new file mode 100644
index 00000000..b2372081
--- /dev/null
+++ b/src/stdlib/utf8.rs
@@ -0,0 +1,351 @@
+use gc_arena::Collect;
+
+use crate::{BoxSequence, Callback, CallbackReturn, Context, Error, IntoValue, Sequence, SequencePoll, Table, Value};
+
+fn utf8_sequence_length<'gc>(ctx: Context<'gc>, byte: u8, position: usize) -> Result<usize, Error<'gc>> {
+    if byte & 0x80 == 0 {
+        Ok(1)
+    } else if byte & 0xE0 == 0xC0 {
+        Ok(2)
+    } else if byte & 0xF0 == 0xE0 {
+        Ok(3)
+    } else if byte & 0xF8 == 0xF0 {
+        Ok(4)
+    } else {
+        Err(format!("invalid UTF-8 sequence at position {}", position + 1)
+            .into_value(ctx).into())
+    }
+}
+
+fn validate_utf8_sequence<'gc>(ctx: Context<'gc>, position: usize, expected_bytes: usize, bytes: &[u8]) -> Result<(), Error<'gc>> {
+    if position + expected_bytes > bytes.len() {
+        return Err(format!("incomplete UTF-8 code at position {}", position + 1)
+                      .into_value(ctx).into());
+    }
+    
+    for i in 1..expected_bytes {
+        if bytes[position + i] & 0xC0 != 0x80 {
+            return Err(format!("invalid UTF-8 code at position {}", position + 1)
+                      .into_value(ctx).into());
+        }
+    }
+    
+    Ok(())
+}
+
+fn decode_utf8_codepoint(position: usize, expected_bytes: usize, bytes: &[u8]) -> u32 {
+    match expected_bytes {
+        1 => bytes[position] as u32,
+        2 => {
+            ((bytes[position] & 0x1F) as u32) << 6 |
+            ((bytes[position + 1] & 0x3F) as u32)
+        }
+        3 => {
+            ((bytes[position] & 0x0F) as u32) << 12 |
+            ((bytes[position + 1] & 0x3F) as u32) << 6 |
+            ((bytes[position + 2] & 0x3F) as u32)
+        }
+        4 => {
+            ((bytes[position] & 0x07) as u32) << 18 |
+            ((bytes[position + 1] & 0x3F) as u32) << 12 |
+            ((bytes[position + 2] & 0x3F) as u32) << 6 |
+            ((bytes[position + 3] & 0x3F) as u32)
+        }
+        _ => unreachable!() // this should never happen!!
+    }
+}
+
+fn adjust_index(index: i64, len: usize) -> usize {
+    if index > 0 {
+        index.saturating_sub(1) as usize
+    } else if index < 0 {
+        len.saturating_sub(index.unsigned_abs() as usize)
+    } else {
+        0
+    }
+}
+
+fn calculate_string_range(start: usize, end: usize, len: usize) -> Option<(usize, usize)> {
+    if start >= len || (end < start && end != 0) {
+        None
+    } else {
+        Some((start, end.min(len)))
+    }
+}
+
+pub fn load_utf8(ctx: Context) {
+    let utf8 = Table::new(&ctx);
+    
+    utf8.set_field(ctx, "char", Callback::from_fn(&ctx, |ctx, _, mut stack| {
+        let mut bytes = Vec::with_capacity(stack.len() * 4);
+        let iter = stack.into_iter().enumerate();
+        
+        for (idx, i) in iter {
+            let code = match i.to_integer() {
+                Some(code) => code as u32,
+                None => {
+                    return Err(format!("bad argument #{} to 'char' (number expected, got {})", 
+                                     idx + 1, i.type_name()).into_value(ctx).into())
+                }
+            };
+            
+            if let Some(c) = char::from_u32(code) {
+                let mut buf = [0; 4];
+                let utf8_bytes = c.encode_utf8(&mut buf).as_bytes();
+                bytes.extend_from_slice(utf8_bytes);
+            } else {
+                return Err(format!("bad argument #{} to 'char' (value out of range)", 
+                                 idx + 1).into_value(ctx).into())
+            }
+        }
+        
+        let result = ctx.intern(&bytes);
+        stack.replace(ctx, result);
+        
+        Ok(CallbackReturn::Return)
+    }));
+    
+    let _ = utf8.set(ctx, "charpattern", r"[\0-\x7F\xC2-\xF4][\x80-\xBF]*");
+
+    utf8.set_field(ctx, "codes", Callback::from_fn(&ctx, |ctx, _, mut stack| {
+        #[derive(Collect, Clone)]
+        #[collect(require_static)]
+        struct Codes {
+            s: String,
+            pos: usize,
+        }
+
+        impl<'gc> Sequence<'gc> for Codes {
+            fn poll(
+                    mut self: std::pin::Pin<&mut Self>,
+                    ctx: Context<'gc>,
+                    _exec: crate::Execution<'gc, '_>,
+                    mut stack: crate::Stack<'gc, '_>,
+                ) -> Result<SequencePoll<'gc>, Error<'gc>> {
+                    let position = self.pos;
+                    let bytes = self.s.as_bytes();
+                    let len = bytes.len();
+                    
+                    if position >= len {
+                        stack.replace(ctx, Value::Nil);
+                        return Ok(SequencePoll::Return);
+                    }
+                    
+                    let byte = bytes[position];
+                    
+                    let expected_bytes = utf8_sequence_length(ctx, byte, position)?;
+                    
+                    validate_utf8_sequence(ctx, position, expected_bytes, bytes)?;
+                    
+                    let code_point = decode_utf8_codepoint(position, expected_bytes, bytes);
+                    
+                    stack.clear();
+                    stack.into_back(ctx, position as i64 + 1);
+                    stack.into_back(ctx, code_point as i64);
+                    
+                    self.pos += expected_bytes;
+                    
+                    Ok(SequencePoll::Return)
+            }
+        }
+
+        let s = stack.consume::<String>(ctx)?;
+
+        let root = Codes {
+            s: s.to_owned(),
+            pos: 0
+        };
+        
+        let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| {
+            Ok(CallbackReturn::Sequence(BoxSequence::new(&ctx, root.clone())))
+        });
+        
+        stack.replace(ctx, codes);
+        
+        Ok(CallbackReturn::Return)
+    }));
+
+    utf8.set_field(ctx, "len", Callback::from_fn(&ctx, |ctx, _, mut stack| {
+        let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
+        let bytes = s.as_bytes();
+        let len = bytes.len();
+        
+        let i = i.unwrap_or(1);
+        let j = j.unwrap_or(-1);
+
+        let start = adjust_index(i, len);
+        let end = adjust_index(j, len);
+        
+        let (start, end) = match calculate_string_range(start, end, len) {
+            Some(range) => range,
+            None => {
+                stack.replace(ctx, 0);
+                return Ok(CallbackReturn::Return);
+            }
+        };
+        
+        let mut char_count = 0;
+        let mut position = start;
+        
+        while position < end {
+            let byte = bytes[position];
+            
+            let expected_bytes = match utf8_sequence_length(ctx, byte, position) {
+                Ok(len) => len,
+                Err(_) => {
+                    stack.clear();
+                    stack.push_back(Value::Boolean(false));
+                    stack.push_back(Value::Integer(position as i64 + 1));
+                    return Ok(CallbackReturn::Return);
+                }
+            };
+            
+            if position + expected_bytes > end {
+                break;
+            }
+            
+            match validate_utf8_sequence(ctx, position, expected_bytes, bytes) {
+                Ok(_) => {},
+                Err(_) => {
+                    stack.clear();
+                    stack.push_back(Value::Boolean(false));
+                    stack.push_back(Value::Integer(position as i64 + 1));
+                    return Ok(CallbackReturn::Return);
+                }
+            }
+            
+            char_count += 1;
+            position += expected_bytes;
+        }
+        
+        stack.replace(ctx, char_count);
+        Ok(CallbackReturn::Return)
+    }));
+    
+    utf8.set_field(ctx, "codepoint", Callback::from_fn(&ctx, |ctx, _, mut stack| {
+        let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
+        let bytes = s.as_bytes();
+        let len = bytes.len();
+        
+        let i = i.unwrap_or(1);
+        let j = j.unwrap_or(i);
+
+        let start = adjust_index(i, len);
+        let end = adjust_index(j, len);
+        
+        if start >= len || end >= len || end < start {
+            return Ok(CallbackReturn::Return)
+        }
+        
+        let mut position = start;
+        let mut codepoints = Vec::new();
+        
+        while position <= end {
+            if position >= len {
+                break;
+            }
+            
+            let byte = bytes[position];
+            
+            let expected_bytes = utf8_sequence_length(ctx, byte, position)?;
+            
+            validate_utf8_sequence(ctx, position, expected_bytes, bytes)?;
+            
+            let code_point = decode_utf8_codepoint(position, expected_bytes, bytes);
+            
+            if position <= end {
+                codepoints.push(code_point as i64);
+            }
+            
+            position += expected_bytes;
+        }
+        
+        stack.clear();
+        for codepoint in codepoints {
+            stack.push_back(Value::Integer(codepoint));
+        }
+        
+        Ok(CallbackReturn::Return)
+    }));
+
+    utf8.set_field(ctx, "offset", Callback::from_fn(&ctx, |ctx, _, mut stack| {
+        let (s, n, i): (String, i64, Option<i64>) = stack.consume(ctx)?;
+        let bytes = s.as_bytes();
+        let len = bytes.len();
+        
+        let i = i.unwrap_or(if n >= 0 { 1 } else { len as i64 + 1 });
+        
+        let mut pos = adjust_index(i, len);
+        
+        if n == 0 {
+            if pos >= len {
+                stack.replace(ctx, Value::Nil);
+                return Ok(CallbackReturn::Return);
+            }
+            
+            while pos > 0 && (bytes[pos] & 0xC0) == 0x80 {
+                pos -= 1;
+            }
+            
+            stack.replace(ctx, (pos as i64) + 1);
+            return Ok(CallbackReturn::Return);
+        }
+        
+        if n > 0 {
+            let mut count = 0;
+            
+            while count < n && pos < len {
+                if (bytes[pos] & 0xC0) != 0x80 {
+                    count += 1;
+                }
+                
+                if count == n {
+                    break;
+                }
+                
+                pos += 1;
+            }
+            
+            if count == n - 1 && pos == len {
+                stack.replace(ctx, (pos as i64) + 1);
+                return Ok(CallbackReturn::Return);
+            } else if count < n {
+                stack.replace(ctx, Value::Nil);
+                return Ok(CallbackReturn::Return);
+            }
+        } else if n < 0 {
+            let mut count = 0;
+            
+            if pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 {
+                while pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 {
+                    pos -= 1;
+                }
+                if pos > 0 {
+                    pos -= 1;
+                }
+            } else if pos > 0 {
+                pos -= 1;
+            }
+            
+            while count < (-n) && pos > 0 {
+                pos -= 1;
+                
+                while pos > 0 && (bytes[pos] & 0xC0) == 0x80 {
+                    pos -= 1;
+                }
+                
+                count += 1;
+            }
+            
+            if count < (-n) {
+                stack.replace(ctx, Value::Nil);
+                return Ok(CallbackReturn::Return);
+            }
+        }
+        
+        stack.replace(ctx, (pos as i64) + 1);
+        Ok(CallbackReturn::Return)
+    }));
+    
+    ctx.set_global("utf8", utf8);
+}
\ No newline at end of file

From b5643dba63eabd99fcb38f50a7e184ed755d4a74 Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Fri, 2 May 2025 23:49:55 +0600
Subject: [PATCH 02/12] cargo fmt

---
 src/stdlib/utf8.rs | 543 ++++++++++++++++++++++++---------------------
 1 file changed, 295 insertions(+), 248 deletions(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index b2372081..3304a886 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -1,8 +1,15 @@
 use gc_arena::Collect;
 
-use crate::{BoxSequence, Callback, CallbackReturn, Context, Error, IntoValue, Sequence, SequencePoll, Table, Value};
+use crate::{
+    BoxSequence, Callback, CallbackReturn, Context, Error, IntoValue, Sequence, SequencePoll,
+    Table, Value,
+};
 
-fn utf8_sequence_length<'gc>(ctx: Context<'gc>, byte: u8, position: usize) -> Result<usize, Error<'gc>> {
+fn utf8_sequence_length<'gc>(
+    ctx: Context<'gc>,
+    byte: u8,
+    position: usize,
+) -> Result<usize, Error<'gc>> {
     if byte & 0x80 == 0 {
         Ok(1)
     } else if byte & 0xE0 == 0xC0 {
@@ -12,46 +19,55 @@ fn utf8_sequence_length<'gc>(ctx: Context<'gc>, byte: u8, position: usize) -> Re
     } else if byte & 0xF8 == 0xF0 {
         Ok(4)
     } else {
-        Err(format!("invalid UTF-8 sequence at position {}", position + 1)
-            .into_value(ctx).into())
+        Err(
+            format!("invalid UTF-8 sequence at position {}", position + 1)
+                .into_value(ctx)
+                .into(),
+        )
     }
 }
 
-fn validate_utf8_sequence<'gc>(ctx: Context<'gc>, position: usize, expected_bytes: usize, bytes: &[u8]) -> Result<(), Error<'gc>> {
+fn validate_utf8_sequence<'gc>(
+    ctx: Context<'gc>,
+    position: usize,
+    expected_bytes: usize,
+    bytes: &[u8],
+) -> Result<(), Error<'gc>> {
     if position + expected_bytes > bytes.len() {
-        return Err(format!("incomplete UTF-8 code at position {}", position + 1)
-                      .into_value(ctx).into());
+        return Err(
+            format!("incomplete UTF-8 code at position {}", position + 1)
+                .into_value(ctx)
+                .into(),
+        );
     }
-    
+
     for i in 1..expected_bytes {
         if bytes[position + i] & 0xC0 != 0x80 {
             return Err(format!("invalid UTF-8 code at position {}", position + 1)
-                      .into_value(ctx).into());
+                .into_value(ctx)
+                .into());
         }
     }
-    
+
     Ok(())
 }
 
 fn decode_utf8_codepoint(position: usize, expected_bytes: usize, bytes: &[u8]) -> u32 {
     match expected_bytes {
         1 => bytes[position] as u32,
-        2 => {
-            ((bytes[position] & 0x1F) as u32) << 6 |
-            ((bytes[position + 1] & 0x3F) as u32)
-        }
+        2 => ((bytes[position] & 0x1F) as u32) << 6 | ((bytes[position + 1] & 0x3F) as u32),
         3 => {
-            ((bytes[position] & 0x0F) as u32) << 12 |
-            ((bytes[position + 1] & 0x3F) as u32) << 6 |
-            ((bytes[position + 2] & 0x3F) as u32)
+            ((bytes[position] & 0x0F) as u32) << 12
+                | ((bytes[position + 1] & 0x3F) as u32) << 6
+                | ((bytes[position + 2] & 0x3F) as u32)
         }
         4 => {
-            ((bytes[position] & 0x07) as u32) << 18 |
-            ((bytes[position + 1] & 0x3F) as u32) << 12 |
-            ((bytes[position + 2] & 0x3F) as u32) << 6 |
-            ((bytes[position + 3] & 0x3F) as u32)
+            ((bytes[position] & 0x07) as u32) << 18
+                | ((bytes[position + 1] & 0x3F) as u32) << 12
+                | ((bytes[position + 2] & 0x3F) as u32) << 6
+                | ((bytes[position + 3] & 0x3F) as u32)
         }
-        _ => unreachable!() // this should never happen!!
+        _ => unreachable!(), // this should never happen!!
     }
 }
 
@@ -75,48 +91,63 @@ fn calculate_string_range(start: usize, end: usize, len: usize) -> Option<(usize
 
 pub fn load_utf8(ctx: Context) {
     let utf8 = Table::new(&ctx);
-    
-    utf8.set_field(ctx, "char", Callback::from_fn(&ctx, |ctx, _, mut stack| {
-        let mut bytes = Vec::with_capacity(stack.len() * 4);
-        let iter = stack.into_iter().enumerate();
-        
-        for (idx, i) in iter {
-            let code = match i.to_integer() {
-                Some(code) => code as u32,
-                None => {
-                    return Err(format!("bad argument #{} to 'char' (number expected, got {})", 
-                                     idx + 1, i.type_name()).into_value(ctx).into())
+
+    utf8.set_field(
+        ctx,
+        "char",
+        Callback::from_fn(&ctx, |ctx, _, mut stack| {
+            let mut bytes = Vec::with_capacity(stack.len() * 4);
+            let iter = stack.into_iter().enumerate();
+
+            for (idx, i) in iter {
+                let code = match i.to_integer() {
+                    Some(code) => code as u32,
+                    None => {
+                        return Err(format!(
+                            "bad argument #{} to 'char' (number expected, got {})",
+                            idx + 1,
+                            i.type_name()
+                        )
+                        .into_value(ctx)
+                        .into())
+                    }
+                };
+
+                if let Some(c) = char::from_u32(code) {
+                    let mut buf = [0; 4];
+                    let utf8_bytes = c.encode_utf8(&mut buf).as_bytes();
+                    bytes.extend_from_slice(utf8_bytes);
+                } else {
+                    return Err(
+                        format!("bad argument #{} to 'char' (value out of range)", idx + 1)
+                            .into_value(ctx)
+                            .into(),
+                    );
                 }
-            };
-            
-            if let Some(c) = char::from_u32(code) {
-                let mut buf = [0; 4];
-                let utf8_bytes = c.encode_utf8(&mut buf).as_bytes();
-                bytes.extend_from_slice(utf8_bytes);
-            } else {
-                return Err(format!("bad argument #{} to 'char' (value out of range)", 
-                                 idx + 1).into_value(ctx).into())
             }
-        }
-        
-        let result = ctx.intern(&bytes);
-        stack.replace(ctx, result);
-        
-        Ok(CallbackReturn::Return)
-    }));
-    
+
+            let result = ctx.intern(&bytes);
+            stack.replace(ctx, result);
+
+            Ok(CallbackReturn::Return)
+        }),
+    );
+
     let _ = utf8.set(ctx, "charpattern", r"[\0-\x7F\xC2-\xF4][\x80-\xBF]*");
 
-    utf8.set_field(ctx, "codes", Callback::from_fn(&ctx, |ctx, _, mut stack| {
-        #[derive(Collect, Clone)]
-        #[collect(require_static)]
-        struct Codes {
-            s: String,
-            pos: usize,
-        }
+    utf8.set_field(
+        ctx,
+        "codes",
+        Callback::from_fn(&ctx, |ctx, _, mut stack| {
+            #[derive(Collect, Clone)]
+            #[collect(require_static)]
+            struct Codes {
+                s: String,
+                pos: usize,
+            }
 
-        impl<'gc> Sequence<'gc> for Codes {
-            fn poll(
+            impl<'gc> Sequence<'gc> for Codes {
+                fn poll(
                     mut self: std::pin::Pin<&mut Self>,
                     ctx: Context<'gc>,
                     _exec: crate::Execution<'gc, '_>,
@@ -125,227 +156,243 @@ pub fn load_utf8(ctx: Context) {
                     let position = self.pos;
                     let bytes = self.s.as_bytes();
                     let len = bytes.len();
-                    
+
                     if position >= len {
                         stack.replace(ctx, Value::Nil);
                         return Ok(SequencePoll::Return);
                     }
-                    
+
                     let byte = bytes[position];
-                    
+
                     let expected_bytes = utf8_sequence_length(ctx, byte, position)?;
-                    
+
                     validate_utf8_sequence(ctx, position, expected_bytes, bytes)?;
-                    
+
                     let code_point = decode_utf8_codepoint(position, expected_bytes, bytes);
-                    
+
                     stack.clear();
                     stack.into_back(ctx, position as i64 + 1);
                     stack.into_back(ctx, code_point as i64);
-                    
+
                     self.pos += expected_bytes;
-                    
+
                     Ok(SequencePoll::Return)
+                }
             }
-        }
 
-        let s = stack.consume::<String>(ctx)?;
-
-        let root = Codes {
-            s: s.to_owned(),
-            pos: 0
-        };
-        
-        let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| {
-            Ok(CallbackReturn::Sequence(BoxSequence::new(&ctx, root.clone())))
-        });
-        
-        stack.replace(ctx, codes);
-        
-        Ok(CallbackReturn::Return)
-    }));
-
-    utf8.set_field(ctx, "len", Callback::from_fn(&ctx, |ctx, _, mut stack| {
-        let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
-        let bytes = s.as_bytes();
-        let len = bytes.len();
-        
-        let i = i.unwrap_or(1);
-        let j = j.unwrap_or(-1);
-
-        let start = adjust_index(i, len);
-        let end = adjust_index(j, len);
-        
-        let (start, end) = match calculate_string_range(start, end, len) {
-            Some(range) => range,
-            None => {
-                stack.replace(ctx, 0);
-                return Ok(CallbackReturn::Return);
-            }
-        };
-        
-        let mut char_count = 0;
-        let mut position = start;
-        
-        while position < end {
-            let byte = bytes[position];
-            
-            let expected_bytes = match utf8_sequence_length(ctx, byte, position) {
-                Ok(len) => len,
-                Err(_) => {
-                    stack.clear();
-                    stack.push_back(Value::Boolean(false));
-                    stack.push_back(Value::Integer(position as i64 + 1));
+            let s = stack.consume::<String>(ctx)?;
+
+            let root = Codes {
+                s: s.to_owned(),
+                pos: 0,
+            };
+
+            let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| {
+                Ok(CallbackReturn::Sequence(BoxSequence::new(
+                    &ctx,
+                    root.clone(),
+                )))
+            });
+
+            stack.replace(ctx, codes);
+
+            Ok(CallbackReturn::Return)
+        }),
+    );
+
+    utf8.set_field(
+        ctx,
+        "len",
+        Callback::from_fn(&ctx, |ctx, _, mut stack| {
+            let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
+            let bytes = s.as_bytes();
+            let len = bytes.len();
+
+            let i = i.unwrap_or(1);
+            let j = j.unwrap_or(-1);
+
+            let start = adjust_index(i, len);
+            let end = adjust_index(j, len);
+
+            let (start, end) = match calculate_string_range(start, end, len) {
+                Some(range) => range,
+                None => {
+                    stack.replace(ctx, 0);
                     return Ok(CallbackReturn::Return);
                 }
             };
-            
-            if position + expected_bytes > end {
-                break;
-            }
-            
-            match validate_utf8_sequence(ctx, position, expected_bytes, bytes) {
-                Ok(_) => {},
-                Err(_) => {
-                    stack.clear();
-                    stack.push_back(Value::Boolean(false));
-                    stack.push_back(Value::Integer(position as i64 + 1));
-                    return Ok(CallbackReturn::Return);
+
+            let mut char_count = 0;
+            let mut position = start;
+
+            while position < end {
+                let byte = bytes[position];
+
+                let expected_bytes = match utf8_sequence_length(ctx, byte, position) {
+                    Ok(len) => len,
+                    Err(_) => {
+                        stack.clear();
+                        stack.push_back(Value::Boolean(false));
+                        stack.push_back(Value::Integer(position as i64 + 1));
+                        return Ok(CallbackReturn::Return);
+                    }
+                };
+
+                if position + expected_bytes > end {
+                    break;
                 }
+
+                match validate_utf8_sequence(ctx, position, expected_bytes, bytes) {
+                    Ok(_) => {}
+                    Err(_) => {
+                        stack.clear();
+                        stack.push_back(Value::Boolean(false));
+                        stack.push_back(Value::Integer(position as i64 + 1));
+                        return Ok(CallbackReturn::Return);
+                    }
+                }
+
+                char_count += 1;
+                position += expected_bytes;
             }
-            
-            char_count += 1;
-            position += expected_bytes;
-        }
-        
-        stack.replace(ctx, char_count);
-        Ok(CallbackReturn::Return)
-    }));
-    
-    utf8.set_field(ctx, "codepoint", Callback::from_fn(&ctx, |ctx, _, mut stack| {
-        let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
-        let bytes = s.as_bytes();
-        let len = bytes.len();
-        
-        let i = i.unwrap_or(1);
-        let j = j.unwrap_or(i);
-
-        let start = adjust_index(i, len);
-        let end = adjust_index(j, len);
-        
-        if start >= len || end >= len || end < start {
-            return Ok(CallbackReturn::Return)
-        }
-        
-        let mut position = start;
-        let mut codepoints = Vec::new();
-        
-        while position <= end {
-            if position >= len {
-                break;
-            }
-            
-            let byte = bytes[position];
-            
-            let expected_bytes = utf8_sequence_length(ctx, byte, position)?;
-            
-            validate_utf8_sequence(ctx, position, expected_bytes, bytes)?;
-            
-            let code_point = decode_utf8_codepoint(position, expected_bytes, bytes);
-            
-            if position <= end {
-                codepoints.push(code_point as i64);
-            }
-            
-            position += expected_bytes;
-        }
-        
-        stack.clear();
-        for codepoint in codepoints {
-            stack.push_back(Value::Integer(codepoint));
-        }
-        
-        Ok(CallbackReturn::Return)
-    }));
-
-    utf8.set_field(ctx, "offset", Callback::from_fn(&ctx, |ctx, _, mut stack| {
-        let (s, n, i): (String, i64, Option<i64>) = stack.consume(ctx)?;
-        let bytes = s.as_bytes();
-        let len = bytes.len();
-        
-        let i = i.unwrap_or(if n >= 0 { 1 } else { len as i64 + 1 });
-        
-        let mut pos = adjust_index(i, len);
-        
-        if n == 0 {
-            if pos >= len {
-                stack.replace(ctx, Value::Nil);
+
+            stack.replace(ctx, char_count);
+            Ok(CallbackReturn::Return)
+        }),
+    );
+
+    utf8.set_field(
+        ctx,
+        "codepoint",
+        Callback::from_fn(&ctx, |ctx, _, mut stack| {
+            let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
+            let bytes = s.as_bytes();
+            let len = bytes.len();
+
+            let i = i.unwrap_or(1);
+            let j = j.unwrap_or(i);
+
+            let start = adjust_index(i, len);
+            let end = adjust_index(j, len);
+
+            if start >= len || end >= len || end < start {
                 return Ok(CallbackReturn::Return);
             }
-            
-            while pos > 0 && (bytes[pos] & 0xC0) == 0x80 {
-                pos -= 1;
-            }
-            
-            stack.replace(ctx, (pos as i64) + 1);
-            return Ok(CallbackReturn::Return);
-        }
-        
-        if n > 0 {
-            let mut count = 0;
-            
-            while count < n && pos < len {
-                if (bytes[pos] & 0xC0) != 0x80 {
-                    count += 1;
-                }
-                
-                if count == n {
+
+            let mut position = start;
+            let mut codepoints = Vec::new();
+
+            while position <= end {
+                if position >= len {
                     break;
                 }
-                
-                pos += 1;
+
+                let byte = bytes[position];
+
+                let expected_bytes = utf8_sequence_length(ctx, byte, position)?;
+
+                validate_utf8_sequence(ctx, position, expected_bytes, bytes)?;
+
+                let code_point = decode_utf8_codepoint(position, expected_bytes, bytes);
+
+                if position <= end {
+                    codepoints.push(code_point as i64);
+                }
+
+                position += expected_bytes;
             }
-            
-            if count == n - 1 && pos == len {
+
+            stack.clear();
+            for codepoint in codepoints {
+                stack.push_back(Value::Integer(codepoint));
+            }
+
+            Ok(CallbackReturn::Return)
+        }),
+    );
+
+    utf8.set_field(
+        ctx,
+        "offset",
+        Callback::from_fn(&ctx, |ctx, _, mut stack| {
+            let (s, n, i): (String, i64, Option<i64>) = stack.consume(ctx)?;
+            let bytes = s.as_bytes();
+            let len = bytes.len();
+
+            let i = i.unwrap_or(if n >= 0 { 1 } else { len as i64 + 1 });
+
+            let mut pos = adjust_index(i, len);
+
+            if n == 0 {
+                if pos >= len {
+                    stack.replace(ctx, Value::Nil);
+                    return Ok(CallbackReturn::Return);
+                }
+
+                while pos > 0 && (bytes[pos] & 0xC0) == 0x80 {
+                    pos -= 1;
+                }
+
                 stack.replace(ctx, (pos as i64) + 1);
                 return Ok(CallbackReturn::Return);
-            } else if count < n {
-                stack.replace(ctx, Value::Nil);
-                return Ok(CallbackReturn::Return);
             }
-        } else if n < 0 {
-            let mut count = 0;
-            
-            if pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 {
-                while pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 {
-                    pos -= 1;
+
+            if n > 0 {
+                let mut count = 0;
+
+                while count < n && pos < len {
+                    if (bytes[pos] & 0xC0) != 0x80 {
+                        count += 1;
+                    }
+
+                    if count == n {
+                        break;
+                    }
+
+                    pos += 1;
                 }
-                if pos > 0 {
+
+                if count == n - 1 && pos == len {
+                    stack.replace(ctx, (pos as i64) + 1);
+                    return Ok(CallbackReturn::Return);
+                } else if count < n {
+                    stack.replace(ctx, Value::Nil);
+                    return Ok(CallbackReturn::Return);
+                }
+            } else if n < 0 {
+                let mut count = 0;
+
+                if pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 {
+                    while pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 {
+                        pos -= 1;
+                    }
+                    if pos > 0 {
+                        pos -= 1;
+                    }
+                } else if pos > 0 {
                     pos -= 1;
                 }
-            } else if pos > 0 {
-                pos -= 1;
-            }
-            
-            while count < (-n) && pos > 0 {
-                pos -= 1;
-                
-                while pos > 0 && (bytes[pos] & 0xC0) == 0x80 {
+
+                while count < (-n) && pos > 0 {
                     pos -= 1;
+
+                    while pos > 0 && (bytes[pos] & 0xC0) == 0x80 {
+                        pos -= 1;
+                    }
+
+                    count += 1;
+                }
+
+                if count < (-n) {
+                    stack.replace(ctx, Value::Nil);
+                    return Ok(CallbackReturn::Return);
                 }
-                
-                count += 1;
-            }
-            
-            if count < (-n) {
-                stack.replace(ctx, Value::Nil);
-                return Ok(CallbackReturn::Return);
             }
-        }
-        
-        stack.replace(ctx, (pos as i64) + 1);
-        Ok(CallbackReturn::Return)
-    }));
-    
+
+            stack.replace(ctx, (pos as i64) + 1);
+            Ok(CallbackReturn::Return)
+        }),
+    );
+
     ctx.set_global("utf8", utf8);
-}
\ No newline at end of file
+}

From d5447babbc1c353b8e8360db2e37bba3415a419e Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Sat, 3 May 2025 01:11:34 +0600
Subject: [PATCH 03/12] `feature`: added tests for all functions, fixed some bugs

---
 src/lua.rs             |   3 +-
 src/stdlib/utf8.rs     | 162 +++++++++++++++----------
 tests/scripts/utf8.lua | 260 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 364 insertions(+), 61 deletions(-)
 create mode 100644 tests/scripts/utf8.lua

diff --git a/src/lua.rs b/src/lua.rs
index 67f03007..ce270d87 100644
--- a/src/lua.rs
+++ b/src/lua.rs
@@ -9,7 +9,7 @@ use gc_arena::{
 use crate::{
     finalizers::Finalizers,
     stash::{Fetchable, Stashable},
-    stdlib::{load_base, load_coroutine, load_io, load_math, load_string, load_table},
+    stdlib::{load_base, load_coroutine, load_io, load_math, load_string, load_table, load_utf8},
     string::InternedStringSet,
     thread::BadThreadMode,
     Error, ExternError, FromMultiValue, FromValue, Fuel, IntoValue, Registry, RuntimeError,
@@ -176,6 +176,7 @@ impl Lua {
             load_math(ctx);
             load_string(ctx);
             load_table(ctx);
+            load_utf8(ctx);
         })
     }
 
diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index 3304a886..cd85c04d 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -1,3 +1,5 @@
+use std::{rc::Rc, sync::atomic::AtomicUsize, sync::atomic::Ordering};
+
 use gc_arena::Collect;
 
 use crate::{
@@ -118,11 +120,12 @@ pub fn load_utf8(ctx: Context) {
                     let utf8_bytes = c.encode_utf8(&mut buf).as_bytes();
                     bytes.extend_from_slice(utf8_bytes);
                 } else {
-                    return Err(
-                        format!("bad argument #{} to 'char' (value out of range)", idx + 1)
-                            .into_value(ctx)
-                            .into(),
-                    );
+                    return Err(format!(
+                        "bad argument #{} to 'char' (value out of range)",
+                        idx + 1
+                    )
+                    .into_value(ctx)
+                    .into());
                 }
             }
 
@@ -143,38 +146,48 @@ pub fn load_utf8(ctx: Context) {
             #[collect(require_static)]
             struct Codes {
                 s: String,
-                pos: usize,
+                pos: Rc<AtomicUsize>,
             }
 
             impl<'gc> Sequence<'gc> for Codes {
                 fn poll(
-                    mut self: std::pin::Pin<&mut Self>,
+                    self: std::pin::Pin<&mut Self>,
                     ctx: Context<'gc>,
                     _exec: crate::Execution<'gc, '_>,
                     mut stack: crate::Stack<'gc, '_>,
                 ) -> Result<SequencePoll<'gc>, Error<'gc>> {
-                    let position = self.pos;
+                    let position = Rc::clone(&self.pos);
                     let bytes = self.s.as_bytes();
                     let len = bytes.len();
 
-                    if position >= len {
+                    if position.load(Ordering::Relaxed) >= len {
                         stack.replace(ctx, Value::Nil);
                         return Ok(SequencePoll::Return);
                     }
 
-                    let byte = bytes[position];
+                    let byte = bytes[position.load(Ordering::Relaxed)];
 
-                    let expected_bytes = utf8_sequence_length(ctx, byte, position)?;
+                    let expected_bytes =
+                        utf8_sequence_length(ctx, byte, position.load(Ordering::Relaxed))?;
 
-                    validate_utf8_sequence(ctx, position, expected_bytes, bytes)?;
+                    validate_utf8_sequence(
+                        ctx,
+                        position.load(Ordering::Relaxed),
+                        expected_bytes,
+                        bytes,
+                    )?;
 
-                    let code_point = decode_utf8_codepoint(position, expected_bytes, bytes);
+                    let code_point = decode_utf8_codepoint(
+                        position.load(Ordering::Relaxed),
+                        expected_bytes,
+                        bytes,
+                    );
 
                     stack.clear();
-                    stack.into_back(ctx, position as i64 + 1);
+                    stack.into_back(ctx, position.load(Ordering::Relaxed) as i64 + 1);
                     stack.into_back(ctx, code_point as i64);
 
-                    self.pos += expected_bytes;
+                    self.pos.fetch_add(expected_bytes, Ordering::Relaxed);
 
                     Ok(SequencePoll::Return)
                 }
@@ -184,7 +197,7 @@ pub fn load_utf8(ctx: Context) {
 
             let root = Codes {
                 s: s.to_owned(),
-                pos: 0,
+                pos: Rc::new(AtomicUsize::new(0)),
             };
 
             let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| {
@@ -225,29 +238,29 @@ pub fn load_utf8(ctx: Context) {
             let mut char_count = 0;
             let mut position = start;
 
-            while position < end {
+            while position <= end {
+                if position >= len {
+                    break;
+                }
+
                 let byte = bytes[position];
 
                 let expected_bytes = match utf8_sequence_length(ctx, byte, position) {
                     Ok(len) => len,
                     Err(_) => {
                         stack.clear();
-                        stack.push_back(Value::Boolean(false));
-                        stack.push_back(Value::Integer(position as i64 + 1));
+                        stack.into_back(ctx, Value::Nil);
+                        stack.into_back(ctx, position as i64 + 1);
                         return Ok(CallbackReturn::Return);
                     }
                 };
 
-                if position + expected_bytes > end {
-                    break;
-                }
-
                 match validate_utf8_sequence(ctx, position, expected_bytes, bytes) {
                     Ok(_) => {}
                     Err(_) => {
                         stack.clear();
-                        stack.push_back(Value::Boolean(false));
-                        stack.push_back(Value::Integer(position as i64 + 1));
+                        stack.into_back(ctx, Value::Nil);
+                        stack.into_back(ctx, position as i64 + 1);
                         return Ok(CallbackReturn::Return);
                     }
                 }
@@ -265,6 +278,16 @@ pub fn load_utf8(ctx: Context) {
         ctx,
         "codepoint",
         Callback::from_fn(&ctx, |ctx, _, mut stack| {
+            fn is_valid_lua_index(index: i64, length: i64) -> bool {
+                if index == 0 {
+                    false
+                } else if index > 0 {
+                    index <= length
+                } else {
+                    index >= -length
+                }
+            }
+
             let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
             let bytes = s.as_bytes();
             let len = bytes.len();
@@ -272,10 +295,23 @@ pub fn load_utf8(ctx: Context) {
             let i = i.unwrap_or(1);
             let j = j.unwrap_or(i);
 
+            if !is_valid_lua_index(j, len as i64) {
+                return Err("bad argument #3 to 'codepoint' (out of bounds)"
+                    .into_value(ctx)
+                    .into());
+            }
+
+            if !is_valid_lua_index(i, len as i64) {
+                return Err(format!("bad argument #2 to 'codepoint' (out of bounds)",)
+                    .into_value(ctx)
+                    .into());
+            }
+
             let start = adjust_index(i, len);
             let end = adjust_index(j, len);
 
-            if start >= len || end >= len || end < start {
+            if start >= len || end < start {
+                // Return empty result if normalized range is invalid
                 return Ok(CallbackReturn::Return);
             }
 
@@ -321,27 +357,39 @@ pub fn load_utf8(ctx: Context) {
 
             let i = i.unwrap_or(if n >= 0 { 1 } else { len as i64 + 1 });
 
-            let mut pos = adjust_index(i, len);
+            if i == 0 {
+                return Err("bad argument #3 to 'offset' (position out of bounds)"
+                    .into_value(ctx)
+                    .into());
+            }
+
+            let mut position = adjust_index(i, len);
+
+            if n != 0 && position < len && (bytes[position] & 0xC0) == 0x80 {
+                return Err("initial position is a continuation byte"
+                    .into_value(ctx)
+                    .into());
+            }
 
             if n == 0 {
-                if pos >= len {
+                if position >= len {
                     stack.replace(ctx, Value::Nil);
                     return Ok(CallbackReturn::Return);
                 }
 
-                while pos > 0 && (bytes[pos] & 0xC0) == 0x80 {
-                    pos -= 1;
+                while position > 0 && (bytes[position] & 0xC0) == 0x80 {
+                    position -= 1;
                 }
 
-                stack.replace(ctx, (pos as i64) + 1);
+                stack.replace(ctx, (position as i64) + 1);
                 return Ok(CallbackReturn::Return);
             }
 
             if n > 0 {
                 let mut count = 0;
 
-                while count < n && pos < len {
-                    if (bytes[pos] & 0xC0) != 0x80 {
+                while count < n && position < len {
+                    if (bytes[position] & 0xC0) != 0x80 {
                         count += 1;
                     }
 
@@ -349,47 +397,41 @@ pub fn load_utf8(ctx: Context) {
                         break;
                     }
 
-                    pos += 1;
+                    position += 1;
+                }
+
+                if count == n {
+                    stack.replace(ctx, (position as i64) + 1);
+                    return Ok(CallbackReturn::Return);
                 }
 
-                if count == n - 1 && pos == len {
-                    stack.replace(ctx, (pos as i64) + 1);
+                if count == n - 1 && position == len {
+                    stack.replace(ctx, (position as i64) + 1);
                     return Ok(CallbackReturn::Return);
                 } else if count < n {
                     stack.replace(ctx, Value::Nil);
                     return Ok(CallbackReturn::Return);
                 }
             } else if n < 0 {
-                let mut count = 0;
-
-                if pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 {
-                    while pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 {
-                        pos -= 1;
-                    }
-                    if pos > 0 {
-                        pos -= 1;
-                    }
-                } else if pos > 0 {
-                    pos -= 1;
-                }
+                let target_count = -n;
+                let mut count = 0i64;
 
-                while count < (-n) && pos > 0 {
-                    pos -= 1;
+                let mut current_byte_index = adjust_index(i, len);
 
-                    while pos > 0 && (bytes[pos] & 0xC0) == 0x80 {
-                        pos -= 1;
+                while count < target_count {
+                    if current_byte_index == 0 {
+                        stack.replace(ctx, Value::Nil);
+                        return Ok(CallbackReturn::Return);
+                    }
+                    current_byte_index -= 1;
+                    if (bytes[current_byte_index] & 0xC0) != 0x80 {
+                        count += 1;
                     }
-
-                    count += 1;
-                }
-
-                if count < (-n) {
-                    stack.replace(ctx, Value::Nil);
-                    return Ok(CallbackReturn::Return);
                 }
+                stack.replace(ctx, (current_byte_index as i64) + 1);
+                return Ok(CallbackReturn::Return);
             }
 
-            stack.replace(ctx, (pos as i64) + 1);
             Ok(CallbackReturn::Return)
         }),
     );
diff --git a/tests/scripts/utf8.lua b/tests/scripts/utf8.lua
new file mode 100644
index 00000000..ef7319be
--- /dev/null
+++ b/tests/scripts/utf8.lua
@@ -0,0 +1,260 @@
+function is_err(f, ...)
+    local status, err = pcall(f, ...)
+    return not status, err
+end
+
+function collect_codes(s)
+    local results = {}
+    local err_status, err_val = pcall(function()
+        for p, c in utf8.codes(s) do
+            table.insert(results, {p, c})
+        end
+    end)
+    if not err_status then
+        return false, err_val
+    end
+    return results
+end
+
+function collect_codepoints(s, i, j)
+    local results = {}
+    local args = {s, i, j}
+    local err_status, err_val = pcall(function()
+        local values = {utf8.codepoint(table.unpack(args))}
+        for _, v in ipairs(values) do
+            table.insert(results, v)
+        end
+    end)
+     if not err_status then
+        return false, err_val
+    end
+    return results
+end
+
+do
+    assert(utf8.char() == "")
+    assert(utf8.char(65) == "A")
+    assert(utf8.char(65, 66, 67) == "ABC")
+    assert(utf8.char(0x41, 0x42, 0x43) == "ABC")
+    assert(utf8.char(1055, 1088, 1080, 1074, 1077, 1090) == "Привет")
+    assert(utf8.char(72, 1080, 33) == "Hи!")
+    assert(utf8.char(0xC2, 0xA2) == "\195\130\194\162")
+    assert(utf8.char(162) == "\194\162")
+    assert(utf8.char(0xE2, 0x82, 0xAC) == "\195\162\194\130\194\172")
+    assert(utf8.char(8364) == "\226\130\172")
+    assert(utf8.char(0xF0, 0x9F, 0x98, 0x80) == "\195\176\194\159\194\152\194\128")
+    assert(utf8.char(128512) == "\240\159\152\128")
+    assert(utf8.char(0) == "\0")
+    assert(utf8.char(65, 0, 66) == "A\0B")
+    assert(utf8.char(0x7F) == "\127")
+    assert(utf8.char(0x80) == "\194\128")
+    assert(utf8.char(0x7FF) == "\223\191")
+    assert(utf8.char(0x800) == "\224\160\128")
+    assert(utf8.char(0xFFFF) == "\239\191\191")
+    assert(utf8.char(0x10000) == "\240\144\128\128")
+    assert(utf8.char(0x10FFFF) == "\244\143\191\191")
+    assert(is_err(utf8.char, "A"))
+    assert(is_err(utf8.char, 65, "B"))
+    assert(is_err(utf8.char, {}))
+    assert(is_err(utf8.char, nil))
+    assert(is_err(utf8.char, -1))
+    assert(is_err(utf8.char, 0x110000))
+    assert(is_err(utf8.char, 0xD800))
+    assert(is_err(utf8.char, 0xDFFF))
+    assert(is_err(utf8.char, 0x110000))
+    assert(is_err(utf8.char, "not a number"))
+end
+
+do
+    assert(utf8.charpattern == "[\\0-\\x7F\\xC2-\\xF4][\\x80-\\xBF]*")
+end
+
+do
+    local empty_codes = collect_codes("")
+    assert(type(empty_codes) == "table" and #empty_codes == 0)
+
+    local abc_codes = collect_codes("ABC")
+    assert(type(abc_codes) == "table" and #abc_codes == 3)
+    assert(abc_codes[1][1] == 1 and abc_codes[1][2] == 65)
+    assert(abc_codes[2][1] == 2 and abc_codes[2][2] == 66)
+    assert(abc_codes[3][1] == 3 and abc_codes[3][2] == 67)
+
+    local ab0c_codes = collect_codes("AB\0C")
+    assert(type(ab0c_codes) == "table" and #ab0c_codes == 4)
+    assert(ab0c_codes[1][1] == 1 and ab0c_codes[1][2] == 65)
+    assert(ab0c_codes[2][1] == 2 and ab0c_codes[2][2] == 66)
+    assert(ab0c_codes[3][1] == 3 and ab0c_codes[3][2] == 0)
+    assert(ab0c_codes[4][1] == 4 and ab0c_codes[4][2] == 67)
+
+    local privet = "Привет"
+    local privet_codes = collect_codes(privet)
+    assert(#privet_codes == 6)
+    assert(privet_codes[1][1] == 1 and privet_codes[1][2] == 1055) 
+    assert(privet_codes[2][1] == 3 and privet_codes[2][2] == 1088) 
+    assert(privet_codes[3][1] == 5 and privet_codes[3][2] == 1080) 
+    assert(privet_codes[4][1] == 7 and privet_codes[4][2] == 1074) 
+    assert(privet_codes[5][1] == 9 and privet_codes[5][2] == 1077) 
+    assert(privet_codes[6][1] == 11 and privet_codes[6][2] == 1090) 
+
+    local hieuro = "Hi€!"
+    local hieuro_codes = collect_codes(hieuro)
+    assert(#hieuro_codes == 4)
+    assert(hieuro_codes[1][1] == 1 and hieuro_codes[1][2] == 72) 
+    assert(hieuro_codes[2][1] == 2 and hieuro_codes[2][2] == 105) 
+    assert(hieuro_codes[3][1] == 3 and hieuro_codes[3][2] == 8364) 
+    assert(hieuro_codes[4][1] == 6 and hieuro_codes[4][2] == 33) 
+
+    local emoji = "😀"
+    local emoji_codes = collect_codes(emoji)
+    assert(#emoji_codes == 1)
+    assert(emoji_codes[1][1] == 1 and emoji_codes[1][2] == 128512)
+
+    assert(collect_codes("abc\xE2\x82") == false)
+    assert(collect_codes("abc\xE2\x82\xFF") == false)
+    assert(collect_codes("abc\xFF") == false)
+    assert(collect_codes("\xC0\x80") == false)
+end
+
+do
+    local s = "ABC"
+    assert(table.concat(collect_codepoints(s), ",") == "65")
+    assert(table.concat(collect_codepoints(s, 1), ",") == "65")
+    assert(table.concat(collect_codepoints(s, 2), ",") == "66")
+    assert(table.concat(collect_codepoints(s, 3), ",") == "67")
+    assert(collect_codepoints(s, 4) == false)
+    assert(table.concat(collect_codepoints(s, 1, 1), ",") == "65")
+    assert(table.concat(collect_codepoints(s, 1, 2), ",") == "65,66")
+    assert(table.concat(collect_codepoints(s, 1, 3), ",") == "65,66,67")
+    assert(table.concat(collect_codepoints(s, 2, 3), ",") == "66,67")
+    assert(table.concat(collect_codepoints(s, 3, 3), ",") == "67")
+    assert(collect_codepoints(s, 1, 10) == false)
+    assert(table.concat(collect_codepoints(s, 3, 1), ",") == "")
+    assert(table.concat(collect_codepoints(s, -1), ",") == "67")
+    assert(table.concat(collect_codepoints(s, -2), ",") == "66")
+    assert(table.concat(collect_codepoints(s, -3), ",") == "65")
+    assert(table.concat(collect_codepoints(s, -3, -1), ",") == "65,66,67")
+    assert(table.concat(collect_codepoints(s, -2, -1), ",") == "66,67")
+    assert(table.concat(collect_codepoints(s, -1, -1), ",") == "67")
+    assert(table.concat(collect_codepoints(s, 1, -1), ",") == "65,66,67")
+    assert(table.concat(collect_codepoints(s, 2, -1), ",") == "66,67")
+    assert(table.concat(collect_codepoints(s, 1, -2), ",") == "65,66")
+    assert(table.concat(collect_codepoints(s, -3, 3), ",") == "65,66,67")
+    assert(table.concat(collect_codepoints(s, -3, 1), ",") == "65")
+
+    local privet = "Привет"
+    assert(table.concat(collect_codepoints(privet, 1), ",") == "1055")
+    assert(table.concat(collect_codepoints(privet, 1), ",") == "1055")
+    assert(table.concat(collect_codepoints(privet, 3), ",") == "1088")
+    assert(table.concat(collect_codepoints(privet, 1, 2), ",") == "1055")
+    assert(table.concat(collect_codepoints(privet, 1, 3), ",") == "1055,1088")
+    assert(table.concat(collect_codepoints(privet, 1, 4), ",") == "1055,1088")
+    assert(table.concat(collect_codepoints(privet, 1, 12), ",") == "1055,1088,1080,1074,1077,1090")
+    assert(table.concat(collect_codepoints(privet, 3, 7), ",") == "1088,1080,1074")
+    assert(table.concat(collect_codepoints(privet, -2, -1), ",") == "1090")
+    assert(table.concat(collect_codepoints(privet, 11, -1), ",") == "1090")
+    assert(table.concat(collect_codepoints(privet, 1, -1), ",") == "1055,1088,1080,1074,1077,1090")
+    assert(collect_codepoints("", 1, 1) == false)
+
+    local emoji = "😀"
+    assert(table.concat(collect_codepoints(emoji), ",") == "128512")
+    assert(table.concat(collect_codepoints(emoji, 1), ",") == "128512")
+    assert(table.concat(collect_codepoints(emoji, 1), ",") == "128512")
+    assert(table.concat(collect_codepoints(emoji, 1, 4), ",") == "128512")
+    assert(collect_codepoints("abc\xE2\x82", 1) == false)
+    assert(collect_codepoints("abc\xE2\x82\xFF", 1) == false)
+    assert(collect_codepoints("abc\xFF", 1) == false)
+    assert(collect_codepoints("abc\xFF", 4) == false)
+    assert(collect_codepoints("abc\xE2\x82", 1, 5) == false)
+    assert(collect_codepoints("abc\xE2\x20\xAC", 1, 6) == false)
+end
+
+do
+    assert(utf8.len("") == 0)
+    assert(utf8.len("ABC") == 3)
+    assert(utf8.len("При") == 3)
+    assert(utf8.len("Привет") == 6)
+    assert(utf8.len("😀") == 1)
+    assert(utf8.len("A😀B") == 3)
+    assert(utf8.len("A\0B") == 3)
+
+    local s = "Привет"
+    assert(utf8.len(s, 1, 1) == 1)
+    assert(utf8.len(s, 1, 2) == 1)
+    assert(utf8.len(s, 1, 3) == 2)
+    assert(utf8.len(s, 1, 4) == 2)
+    assert(utf8.len(s, 3, 4) == 1)
+    assert(utf8.len(s, 3, 6) == 2)
+    assert(utf8.len(s, 1, 12) == 6)
+    assert(utf8.len(s, 1, -1) == 6)
+    assert(utf8.len(s, -12, -1) == 6)
+    assert(utf8.len(s, -2, -1) == 1)
+    assert(utf8.len(s, 11, 12) == 1)
+    assert(utf8.len(s, 1, 6) == 3)
+    assert(utf8.len(s, 7, 12) == 3)
+    assert(utf8.len(s, 13, 20) == 0)
+    assert(utf8.len(s, 5, 1) == 0)
+    assert(utf8.len(s, 1, 11) == 6)
+end
+
+do
+    local s = "Привет"
+    assert(utf8.offset(s, 0) == 1)
+    assert(utf8.offset(s, 1) == 1)
+    assert(utf8.offset(s, 2) == 3)
+    assert(utf8.offset(s, 6) == 11)
+    assert(utf8.offset(s, 7) == 13)
+    assert(utf8.offset(s, 8) == nil)
+    assert(utf8.offset(s, -1) == 11)
+    assert(utf8.offset(s, -2) == 9)
+    assert(utf8.offset(s, -6) == 1)
+    assert(utf8.offset(s, -7) == nil)
+    assert(utf8.offset(s, 1, 1) == 1)
+    assert(is_err(utf8.offset, s, 1, 2))
+    assert(utf8.offset(s, 1, 3) == 3)
+    assert(utf8.offset(s, 2, 3) == 5)
+    assert(utf8.offset(s, 1, 11) == 11)
+    assert(is_err(utf8.offset, s, 1, 12))
+    assert(utf8.offset(s, 1, 13) == 13)
+    assert(utf8.offset(s, 2, 11) == 13)
+    assert(is_err(utf8.offset, s, 2, 12))
+    assert(is_err(utf8.offset, s, -1, 12))
+    assert(utf8.offset(s, -1, 11) == 9)
+    assert(utf8.offset(s, -1, 3) == 1)
+    assert(is_err(utf8.offset, s, -1, 2))
+    assert(utf8.offset(s, -1, 1) == nil)
+    assert(is_err(utf8.offset, s, -2, 12))
+    assert(is_err(utf8.offset, s, -6, 12))
+    assert(is_err(utf8.offset, s, -7, 12))
+    assert(utf8.offset(s, -1, #s + 1) == 11)
+    assert(utf8.offset(s, 0, 1) == 1)
+    assert(utf8.offset(s, 0, 2) == 1)
+    assert(utf8.offset(s, 0, 3) == 3)
+    assert(utf8.offset(s, 0, 4) == 3)
+    assert(utf8.offset(s, 0, 11) == 11)
+    assert(utf8.offset(s, 0, 12) == 11)
+    assert(utf8.offset(s, 0, 13) == nil)
+    assert(is_err(utf8.offset, s, 0, 0))
+    assert(utf8.offset(s, 0, -1) == 11)
+    assert(utf8.offset(s, 0, -12) == 1)
+
+    local ascii = "ABCDEFG"
+    assert(utf8.offset(ascii, 3, 1) == 3)
+    assert(utf8.offset(ascii, -3, 7) == 4)
+    assert(utf8.offset(ascii, 0, 5) == 5)
+
+    local emoji = "A😀B"
+    assert(utf8.offset(emoji, 1) == 1)
+    assert(utf8.offset(emoji, 2) == 2)
+    assert(utf8.offset(emoji, 3) == 6)
+    assert(utf8.offset(emoji, 4) == 7)
+    assert(utf8.offset(emoji, -1) == 6)
+    assert(utf8.offset(emoji, -2) == 2)
+    assert(utf8.offset(emoji, -3) == 1)
+    assert(utf8.offset(emoji, 0, 1) == 1)
+    assert(utf8.offset(emoji, 0, 2) == 2)
+    assert(utf8.offset(emoji, 0, 3) == 2)
+    assert(utf8.offset(emoji, 0, 4) == 2)
+    assert(utf8.offset(emoji, 0, 5) == 2)
+    assert(utf8.offset(emoji, 0, 6) == 6)
+    assert(utf8.offset(emoji, 0, 7) == nil)
+end

From 2c0320c1db3fcc334458865cba5d8df393eba9f6 Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Mon, 2 Jun 2025 17:49:51 +0500
Subject: [PATCH 04/12] `refactor`: simplify `utf8.codes`

---
 src/stdlib/utf8.rs | 91 +++++++++++++++-------------------------------
 1 file changed, 30 insertions(+), 61 deletions(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index cd85c04d..5e96536b 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -1,10 +1,5 @@
-use std::{rc::Rc, sync::atomic::AtomicUsize, sync::atomic::Ordering};
-
-use gc_arena::Collect;
-
 use crate::{
-    BoxSequence, Callback, CallbackReturn, Context, Error, IntoValue, Sequence, SequencePoll,
-    Table, Value,
+    Callback, CallbackReturn, Context, Error, IntoValue, String as LuaString, Table, Value,
 };
 
 fn utf8_sequence_length<'gc>(
@@ -142,72 +137,46 @@ pub fn load_utf8(ctx: Context) {
         ctx,
         "codes",
         Callback::from_fn(&ctx, |ctx, _, mut stack| {
-            #[derive(Collect, Clone)]
-            #[collect(require_static)]
-            struct Codes {
-                s: String,
-                pos: Rc<AtomicUsize>,
-            }
+            let s = stack.consume::<LuaString>(ctx)?;
 
-            impl<'gc> Sequence<'gc> for Codes {
-                fn poll(
-                    self: std::pin::Pin<&mut Self>,
-                    ctx: Context<'gc>,
-                    _exec: crate::Execution<'gc, '_>,
-                    mut stack: crate::Stack<'gc, '_>,
-                ) -> Result<SequencePoll<'gc>, Error<'gc>> {
-                    let position = Rc::clone(&self.pos);
-                    let bytes = self.s.as_bytes();
-                    let len = bytes.len();
-
-                    if position.load(Ordering::Relaxed) >= len {
-                        stack.replace(ctx, Value::Nil);
-                        return Ok(SequencePoll::Return);
-                    }
+            let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| {
+                let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?;
 
-                    let byte = bytes[position.load(Ordering::Relaxed)];
+                let s = s.to_str()?;
+                let n = (n - 1) as usize;
 
-                    let expected_bytes =
-                        utf8_sequence_length(ctx, byte, position.load(Ordering::Relaxed))?;
+                if n >= s.len() {
+                    stack.replace(ctx, (Value::Nil, Value::Nil));
+                    return Ok(CallbackReturn::Return);
+                }
 
-                    validate_utf8_sequence(
-                        ctx,
-                        position.load(Ordering::Relaxed),
-                        expected_bytes,
-                        bytes,
-                    )?;
+                let bytes = &s.as_bytes()[n..];
 
-                    let code_point = decode_utf8_codepoint(
-                        position.load(Ordering::Relaxed),
-                        expected_bytes,
-                        bytes,
-                    );
+                let mut chunks = bytes.utf8_chunks();
 
-                    stack.clear();
-                    stack.into_back(ctx, position.load(Ordering::Relaxed) as i64 + 1);
-                    stack.into_back(ctx, code_point as i64);
+                if let Some(chunk) = chunks.next() {
+                    if !chunk.invalid().is_empty() {
+                        return Err("Invalid UTF-8 byte sequence".into_value(ctx).into());
+                    }
 
-                    self.pos.fetch_add(expected_bytes, Ordering::Relaxed);
+                    if let Some(c) = chunk.valid().chars().next() {
+                        let len = c.len_utf8();
+                        let p = n + len;
+                        let p = (p + 1) as i64;
 
-                    Ok(SequencePoll::Return)
+                        stack.replace(ctx, (p, c as i64));
+                        Ok(CallbackReturn::Return)
+                    } else {
+                        stack.replace(ctx, (Value::Nil, Value::Nil));
+                        Ok(CallbackReturn::Return)
+                    }
+                } else {
+                    stack.replace(ctx, (Value::Nil, Value::Nil));
+                    Ok(CallbackReturn::Return)
                 }
-            }
-
-            let s = stack.consume::<String>(ctx)?;
-
-            let root = Codes {
-                s: s.to_owned(),
-                pos: Rc::new(AtomicUsize::new(0)),
-            };
-
-            let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| {
-                Ok(CallbackReturn::Sequence(BoxSequence::new(
-                    &ctx,
-                    root.clone(),
-                )))
             });
 
-            stack.replace(ctx, codes);
+            stack.replace(ctx, (callback, s, 1));
 
             Ok(CallbackReturn::Return)
         }),

From eccc6fa524c9a2c82e6b7f15b4e7e8ce180c4813 Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Mon, 2 Jun 2025 20:39:54 +0500
Subject: [PATCH 05/12] `fix`: index conversion

---
 src/stdlib/utf8.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index 5e96536b..24ff666c 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -143,7 +143,7 @@ pub fn load_utf8(ctx: Context) {
                 let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?;
 
                 let s = s.to_str()?;
-                let n = (n - 1) as usize;
+                let n = adjust_index(n, s.len());
 
                 if n >= s.len() {
                     stack.replace(ctx, (Value::Nil, Value::Nil));

From e7439757d5d8ce8808de3403431e9ccd82a947ff Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Mon, 2 Jun 2025 22:57:44 +0500
Subject: [PATCH 06/12] `fix`: try to fix indexing bug

---
 src/stdlib/utf8.rs | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index 24ff666c..3c0dcccc 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -139,11 +139,17 @@ pub fn load_utf8(ctx: Context) {
         Callback::from_fn(&ctx, |ctx, _, mut stack| {
             let s = stack.consume::<LuaString>(ctx)?;
 
-            let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| {
+            let callback = Callback::from_fn_with(&ctx, None, |first_call, ctx, _, mut stack| {
                 let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?;
 
+                if n == 0 {
+                    stack.into_back(ctx, 1);
+                } else {
+                    stack.into_back(ctx, n + 1);
+                }
+
                 let s = s.to_str()?;
-                let n = adjust_index(n, s.len());
+                let n = n as usize;
 
                 if n >= s.len() {
                     stack.replace(ctx, (Value::Nil, Value::Nil));
@@ -160,11 +166,13 @@ pub fn load_utf8(ctx: Context) {
                     }
 
                     if let Some(c) = chunk.valid().chars().next() {
-                        let len = c.len_utf8();
-                        let p = n + len;
-                        let p = (p + 1) as i64;
-
-                        stack.replace(ctx, (p, c as i64));
+                        if c.is_ascii() {
+                            stack.into_back(ctx, c as i64);
+                        } else {
+                            let len = c.len_utf8();
+                            let n = stack.consume::<i64>(ctx)?;
+                            stack.replace(ctx, (n + len as i64, c as i64));
+                        }
                         Ok(CallbackReturn::Return)
                     } else {
                         stack.replace(ctx, (Value::Nil, Value::Nil));
@@ -176,7 +184,7 @@ pub fn load_utf8(ctx: Context) {
                 }
             });
 
-            stack.replace(ctx, (callback, s, 1));
+            stack.replace(ctx, (callback, s, 0));
 
             Ok(CallbackReturn::Return)
         }),

From b23cbb52268a6c586dd59e09e60d7ce757fc3065 Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Mon, 2 Jun 2025 22:59:40 +0500
Subject: [PATCH 07/12] `fix`: compile error

---
 src/stdlib/utf8.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index 3c0dcccc..4ec21e8f 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -139,7 +139,7 @@ pub fn load_utf8(ctx: Context) {
         Callback::from_fn(&ctx, |ctx, _, mut stack| {
             let s = stack.consume::<LuaString>(ctx)?;
 
-            let callback = Callback::from_fn_with(&ctx, None, |first_call, ctx, _, mut stack| {
+            let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| {
                 let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?;
 
                 if n == 0 {

From b214a11eef506d4b3eec9b248529f86cb1a6907e Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Mon, 2 Jun 2025 23:06:46 +0500
Subject: [PATCH 08/12] `fix`: indexing bug

---
 src/stdlib/utf8.rs | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index 4ec21e8f..975d05a4 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -166,12 +166,15 @@ pub fn load_utf8(ctx: Context) {
                     }
 
                     if let Some(c) = chunk.valid().chars().next() {
-                        if c.is_ascii() {
-                            stack.into_back(ctx, c as i64);
+                        if n == 0 {
+                            stack.replace(ctx, (1, c as i64));
                         } else {
-                            let len = c.len_utf8();
-                            let n = stack.consume::<i64>(ctx)?;
-                            stack.replace(ctx, (n + len as i64, c as i64));
+                            if c.is_ascii() {
+                                stack.replace(ctx, (n as i64 + 1, c as i64));
+                            } else {
+                                let len = c.len_utf8();
+                                stack.replace(ctx, ((n + len) as i64, c as i64));
+                            }
                         }
                         Ok(CallbackReturn::Return)
                     } else {

From e01396d7a9f4e463ced95221771a008c33276543 Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Mon, 2 Jun 2025 23:08:05 +0500
Subject: [PATCH 09/12] `fix`: remove useless if branch

---
 src/stdlib/utf8.rs | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index 975d05a4..28fa5ccd 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -142,12 +142,6 @@ pub fn load_utf8(ctx: Context) {
             let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| {
                 let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?;
 
-                if n == 0 {
-                    stack.into_back(ctx, 1);
-                } else {
-                    stack.into_back(ctx, n + 1);
-                }
-
                 let s = s.to_str()?;
                 let n = n as usize;
 

From a787394af6ccfdf2e791d443fe629014673189fe Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Tue, 3 Jun 2025 21:09:37 +0500
Subject: [PATCH 10/12] `refactor`: simplify `utf8.len`

---
 src/stdlib/utf8.rs | 73 +++++++++++++---------------------------------
 1 file changed, 21 insertions(+), 52 deletions(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index 28fa5ccd..50d986d8 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -78,14 +78,6 @@ fn adjust_index(index: i64, len: usize) -> usize {
     }
 }
 
-fn calculate_string_range(start: usize, end: usize, len: usize) -> Option<(usize, usize)> {
-    if start >= len || (end < start && end != 0) {
-        None
-    } else {
-        Some((start, end.min(len)))
-    }
-}
-
 pub fn load_utf8(ctx: Context) {
     let utf8 = Table::new(&ctx);
 
@@ -142,15 +134,15 @@ pub fn load_utf8(ctx: Context) {
             let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| {
                 let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?;
 
-                let s = s.to_str()?;
+                let bytes = s.as_bytes();
                 let n = n as usize;
 
-                if n >= s.len() {
+                if n >= bytes.len() {
                     stack.replace(ctx, (Value::Nil, Value::Nil));
                     return Ok(CallbackReturn::Return);
                 }
 
-                let bytes = &s.as_bytes()[n..];
+                let bytes = &bytes[n..];
 
                 let mut chunks = bytes.utf8_chunks();
 
@@ -192,58 +184,35 @@ pub fn load_utf8(ctx: Context) {
         "len",
         Callback::from_fn(&ctx, |ctx, _, mut stack| {
             let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
-            let bytes = s.as_bytes();
-            let len = bytes.len();
-
-            let i = i.unwrap_or(1);
-            let j = j.unwrap_or(-1);
 
-            let start = adjust_index(i, len);
-            let end = adjust_index(j, len);
-
-            let (start, end) = match calculate_string_range(start, end, len) {
-                Some(range) => range,
-                None => {
-                    stack.replace(ctx, 0);
+            let s = match std::str::from_utf8(s.as_bytes()) {
+                Ok(s) => s,
+                Err(err) => {
+                    let position = err.error_len().unwrap_or_default();
+                    stack.replace(ctx, (false, position as i64 + 1));
                     return Ok(CallbackReturn::Return);
                 }
             };
 
-            let mut char_count = 0;
-            let mut position = start;
+            let len = s.len();
 
-            while position <= end {
-                if position >= len {
-                    break;
-                }
+            let i = i.unwrap_or(1);
+            let j = j.unwrap_or(-1);
 
-                let byte = bytes[position];
+            let start = adjust_index(i, len);
+            let end = adjust_index(j, len);
 
-                let expected_bytes = match utf8_sequence_length(ctx, byte, position) {
-                    Ok(len) => len,
-                    Err(_) => {
-                        stack.clear();
-                        stack.into_back(ctx, Value::Nil);
-                        stack.into_back(ctx, position as i64 + 1);
-                        return Ok(CallbackReturn::Return);
-                    }
-                };
+            if start >= len || (end < start && end != 0) {
+                stack.replace(ctx, 0);
+                return Ok(CallbackReturn::Return);
+            }
 
-                match validate_utf8_sequence(ctx, position, expected_bytes, bytes) {
-                    Ok(_) => {}
-                    Err(_) => {
-                        stack.clear();
-                        stack.into_back(ctx, Value::Nil);
-                        stack.into_back(ctx, position as i64 + 1);
-                        return Ok(CallbackReturn::Return);
-                    }
-                }
+            let end = end.min(len);
 
-                char_count += 1;
-                position += expected_bytes;
-            }
+            let s = &s[start..=end];
+
+            stack.replace(ctx, s.chars().count() as i64);
 
-            stack.replace(ctx, char_count);
             Ok(CallbackReturn::Return)
         }),
     );

From 0b5b72cde512d8a58fa7b63744961e7d19e00792 Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Tue, 3 Jun 2025 22:07:43 +0500
Subject: [PATCH 11/12] `refactor`: simplify `utf8.codepoint`

---
 src/stdlib/utf8.rs | 179 +++++++++++----------------------------------
 1 file changed, 42 insertions(+), 137 deletions(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index 50d986d8..9eb9e77b 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -1,81 +1,20 @@
-use crate::{
-    Callback, CallbackReturn, Context, Error, IntoValue, String as LuaString, Table, Value,
-};
-
-fn utf8_sequence_length<'gc>(
-    ctx: Context<'gc>,
-    byte: u8,
-    position: usize,
-) -> Result<usize, Error<'gc>> {
-    if byte & 0x80 == 0 {
-        Ok(1)
-    } else if byte & 0xE0 == 0xC0 {
-        Ok(2)
-    } else if byte & 0xF0 == 0xE0 {
-        Ok(3)
-    } else if byte & 0xF8 == 0xF0 {
-        Ok(4)
-    } else {
-        Err(
-            format!("invalid UTF-8 sequence at position {}", position + 1)
-                .into_value(ctx)
-                .into(),
-        )
-    }
+use crate::{Callback, CallbackReturn, Context, IntoValue, String as LuaString, Table, Value};
+
+fn convert_index(i: i64, len: usize) -> Option<usize> { // 1-based or negative index -> 0-based offset
+    let val = match i {
+        0 => 0, // index 0 is passed through as offset 0
+        v @ 1.. => v - 1, // positive indices are 1-based: shift down by one
+        v @ ..=-1 => (len as i64 + v).max(0), // negative: counted back from `len`, floored at 0
+    };
+    usize::try_from(val).ok() // `None` when the value does not fit in `usize`
 }
 
-fn validate_utf8_sequence<'gc>(
-    ctx: Context<'gc>,
-    position: usize,
-    expected_bytes: usize,
-    bytes: &[u8],
-) -> Result<(), Error<'gc>> {
-    if position + expected_bytes > bytes.len() {
-        return Err(
-            format!("incomplete UTF-8 code at position {}", position + 1)
-                .into_value(ctx)
-                .into(),
-        );
-    }
-
-    for i in 1..expected_bytes {
-        if bytes[position + i] & 0xC0 != 0x80 {
-            return Err(format!("invalid UTF-8 code at position {}", position + 1)
-                .into_value(ctx)
-                .into());
-        }
-    }
-
-    Ok(())
-}
-
-fn decode_utf8_codepoint(position: usize, expected_bytes: usize, bytes: &[u8]) -> u32 {
-    match expected_bytes {
-        1 => bytes[position] as u32,
-        2 => ((bytes[position] & 0x1F) as u32) << 6 | ((bytes[position + 1] & 0x3F) as u32),
-        3 => {
-            ((bytes[position] & 0x0F) as u32) << 12
-                | ((bytes[position + 1] & 0x3F) as u32) << 6
-                | ((bytes[position + 2] & 0x3F) as u32)
-        }
-        4 => {
-            ((bytes[position] & 0x07) as u32) << 18
-                | ((bytes[position + 1] & 0x3F) as u32) << 12
-                | ((bytes[position + 2] & 0x3F) as u32) << 6
-                | ((bytes[position + 3] & 0x3F) as u32)
-        }
-        _ => unreachable!(), // this should never happen!!
-    }
-}
-
-fn adjust_index(index: i64, len: usize) -> usize {
-    if index > 0 {
-        index.saturating_sub(1) as usize
-    } else if index < 0 {
-        len.saturating_sub(index.unsigned_abs() as usize)
-    } else {
-        0
-    }
+fn convert_index_end(i: i64, len: usize) -> Option<usize> { // end index -> `usize`; negatives are relative to `len`
+    let val = match i {
+        v @ 0.. => v, // non-negative indices are returned unchanged
+        v @ ..=-1 => (len as i64 + v + 1).max(0), // negative: -1 maps to `len`, floored at 0
+    };
+    usize::try_from(val).ok() // `None` when the value does not fit in `usize`
 }
 
 pub fn load_utf8(ctx: Context) {
@@ -193,22 +132,19 @@ pub fn load_utf8(ctx: Context) {
                     return Ok(CallbackReturn::Return);
                 }
             };
-
             let len = s.len();
 
-            let i = i.unwrap_or(1);
-            let j = j.unwrap_or(-1);
-
-            let start = adjust_index(i, len);
-            let end = adjust_index(j, len);
+            let start = convert_index(i.unwrap_or(1), len).unwrap_or(usize::MAX);
+            let end = convert_index_end(j.unwrap_or(len as i64), len)
+                .unwrap_or(usize::MAX)
+                .min(len);
 
+            // TODO: we need to check these conditions
             if start >= len || (end < start && end != 0) {
                 stack.replace(ctx, 0);
                 return Ok(CallbackReturn::Return);
             }
 
-            let end = end.min(len);
-
             let s = &s[start..=end];
 
             stack.replace(ctx, s.chars().count() as i64);
@@ -221,70 +157,39 @@ pub fn load_utf8(ctx: Context) {
         ctx,
         "codepoint",
         Callback::from_fn(&ctx, |ctx, _, mut stack| {
-            fn is_valid_lua_index(index: i64, length: i64) -> bool {
-                if index == 0 {
-                    false
-                } else if index > 0 {
-                    index <= length
-                } else {
-                    index >= -length
-                }
-            }
-
             let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
-            let bytes = s.as_bytes();
-            let len = bytes.len();
+
+            let s = std::str::from_utf8(s.as_bytes()).map_err(|err| {
+                format!(
+                    "bad argument #1 to 'codepoint' (invalid byte sequence at {})",
+                    err.error_len().unwrap_or_default()
+                )
+                .into_value(ctx)
+            })?;
+            let len = s.len();
 
             let i = i.unwrap_or(1);
             let j = j.unwrap_or(i);
 
-            if !is_valid_lua_index(j, len as i64) {
-                return Err("bad argument #3 to 'codepoint' (out of bounds)"
-                    .into_value(ctx)
-                    .into());
-            }
+            let start = convert_index(i, len).unwrap_or(usize::MAX);
+            let end = convert_index_end(j, len).unwrap_or(usize::MAX).min(len);
 
-            if !is_valid_lua_index(i, len as i64) {
-                return Err(format!("bad argument #2 to 'codepoint' (out of bounds)",)
-                    .into_value(ctx)
-                    .into());
+            if start > len {
+                stack.replace(ctx, Value::Nil);
+                return Ok(CallbackReturn::Return);
             }
 
-            let start = adjust_index(i, len);
-            let end = adjust_index(j, len);
+            if start < 1 {
+                return Err("bad argument #2 (out of range)".into_value(ctx).into());
+            }
 
-            if start >= len || end < start {
-                // Return empty result if normalized range is invalid
+            if start > end {
                 return Ok(CallbackReturn::Return);
             }
 
-            let mut position = start;
-            let mut codepoints = Vec::new();
-
-            while position <= end {
-                if position >= len {
-                    break;
-                }
-
-                let byte = bytes[position];
-
-                let expected_bytes = utf8_sequence_length(ctx, byte, position)?;
-
-                validate_utf8_sequence(ctx, position, expected_bytes, bytes)?;
-
-                let code_point = decode_utf8_codepoint(position, expected_bytes, bytes);
-
-                if position <= end {
-                    codepoints.push(code_point as i64);
-                }
-
-                position += expected_bytes;
-            }
+            let s = &s[start..=end];
 
-            stack.clear();
-            for codepoint in codepoints {
-                stack.push_back(Value::Integer(codepoint));
-            }
+            stack.extend(s.chars().map(|c| Value::Integer(c as i64)));
 
             Ok(CallbackReturn::Return)
         }),
@@ -306,7 +211,7 @@ pub fn load_utf8(ctx: Context) {
                     .into());
             }
 
-            let mut position = adjust_index(i, len);
+            let mut position = convert_index(i, len).unwrap_or(usize::MAX);
 
             if n != 0 && position < len && (bytes[position] & 0xC0) == 0x80 {
                 return Err("initial position is a continuation byte"
@@ -359,7 +264,7 @@ pub fn load_utf8(ctx: Context) {
                 let target_count = -n;
                 let mut count = 0i64;
 
-                let mut current_byte_index = adjust_index(i, len);
+                let mut current_byte_index = convert_index(i, len).unwrap_or(usize::MAX);
 
                 while count < target_count {
                     if current_byte_index == 0 {

From ebe528eef2df248b39debb2edc32b983a5a4e7e4 Mon Sep 17 00:00:00 2001
From: lyranowl <nikita.malina23@gmail.com>
Date: Sun, 10 Aug 2025 20:51:14 +0500
Subject: [PATCH 12/12] `fix`: recommendations for change have been implemented

---
 src/stdlib/utf8.rs | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs
index 9eb9e77b..8e3fe99c 100644
--- a/src/stdlib/utf8.rs
+++ b/src/stdlib/utf8.rs
@@ -94,12 +94,8 @@ pub fn load_utf8(ctx: Context) {
                         if n == 0 {
                             stack.replace(ctx, (1, c as i64));
                         } else {
-                            if c.is_ascii() {
-                                stack.replace(ctx, (n as i64 + 1, c as i64));
-                            } else {
-                                let len = c.len_utf8();
-                                stack.replace(ctx, ((n + len) as i64, c as i64));
-                            }
+                            let len = c.len_utf8();
+                            stack.replace(ctx, ((n + len) as i64, c as i64));
                         }
                         Ok(CallbackReturn::Return)
                     } else {
@@ -124,14 +120,6 @@ pub fn load_utf8(ctx: Context) {
         Callback::from_fn(&ctx, |ctx, _, mut stack| {
             let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
 
-            let s = match std::str::from_utf8(s.as_bytes()) {
-                Ok(s) => s,
-                Err(err) => {
-                    let position = err.error_len().unwrap_or_default();
-                    stack.replace(ctx, (false, position as i64 + 1));
-                    return Ok(CallbackReturn::Return);
-                }
-            };
             let len = s.len();
 
             let start = convert_index(i.unwrap_or(1), len).unwrap_or(usize::MAX);
@@ -147,6 +135,15 @@ pub fn load_utf8(ctx: Context) {
 
             let s = &s[start..=end];
 
+            let s = match std::str::from_utf8(s.as_bytes()) {
+                Ok(s) => s,
+                Err(err) => {
+                    let position = err.error_len().unwrap_or_default();
+                    stack.replace(ctx, (false, position as i64 + 1));
+                    return Ok(CallbackReturn::Return);
+                }
+            };
+
             stack.replace(ctx, s.chars().count() as i64);
 
             Ok(CallbackReturn::Return)
@@ -159,13 +156,6 @@ pub fn load_utf8(ctx: Context) {
         Callback::from_fn(&ctx, |ctx, _, mut stack| {
             let (s, i, j) = stack.consume::<(String, Option<i64>, Option<i64>)>(ctx)?;
 
-            let s = std::str::from_utf8(s.as_bytes()).map_err(|err| {
-                format!(
-                    "bad argument #1 to 'codepoint' (invalid byte sequence at {})",
-                    err.error_len().unwrap_or_default()
-                )
-                .into_value(ctx)
-            })?;
             let len = s.len();
 
             let i = i.unwrap_or(1);
@@ -189,6 +179,14 @@ pub fn load_utf8(ctx: Context) {
 
             let s = &s[start..=end];
 
+            let s = std::str::from_utf8(s.as_bytes()).map_err(|err| {
+                format!(
+                    "bad argument #1 to 'codepoint' (invalid byte sequence at {})",
+                    err.error_len().unwrap_or_default()
+                )
+                .into_value(ctx)
+            })?;
+
             stack.extend(s.chars().map(|c| Value::Integer(c as i64)));
 
             Ok(CallbackReturn::Return)