From 5f895f90e069832360c8bb78795611e70bbe33c6 Mon Sep 17 00:00:00 2001 From: lyranowl Date: Fri, 2 May 2025 23:49:37 +0600 Subject: [PATCH 01/12] `feature`: implemented `utf8.char`, `utf8.charpattern`, `utf8.codes`, `utf8.len`, `utf8.codepoint` and `utf8.offset` --- src/stdlib/mod.rs | 3 +- src/stdlib/utf8.rs | 351 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 353 insertions(+), 1 deletion(-) create mode 100644 src/stdlib/utf8.rs diff --git a/src/stdlib/mod.rs b/src/stdlib/mod.rs index aa766153..d7695ee4 100644 --- a/src/stdlib/mod.rs +++ b/src/stdlib/mod.rs @@ -4,8 +4,9 @@ mod io; mod math; mod string; mod table; +mod utf8; pub use self::{ base::load_base, coroutine::load_coroutine, io::load_io, math::load_math, string::load_string, - table::load_table, + table::load_table, utf8::load_utf8, }; diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs new file mode 100644 index 00000000..b2372081 --- /dev/null +++ b/src/stdlib/utf8.rs @@ -0,0 +1,351 @@ +use gc_arena::Collect; + +use crate::{BoxSequence, Callback, CallbackReturn, Context, Error, IntoValue, Sequence, SequencePoll, Table, Value}; + +fn utf8_sequence_length<'gc>(ctx: Context<'gc>, byte: u8, position: usize) -> Result> { + if byte & 0x80 == 0 { + Ok(1) + } else if byte & 0xE0 == 0xC0 { + Ok(2) + } else if byte & 0xF0 == 0xE0 { + Ok(3) + } else if byte & 0xF8 == 0xF0 { + Ok(4) + } else { + Err(format!("invalid UTF-8 sequence at position {}", position + 1) + .into_value(ctx).into()) + } +} + +fn validate_utf8_sequence<'gc>(ctx: Context<'gc>, position: usize, expected_bytes: usize, bytes: &[u8]) -> Result<(), Error<'gc>> { + if position + expected_bytes > bytes.len() { + return Err(format!("incomplete UTF-8 code at position {}", position + 1) + .into_value(ctx).into()); + } + + for i in 1..expected_bytes { + if bytes[position + i] & 0xC0 != 0x80 { + return Err(format!("invalid UTF-8 code at position {}", position + 1) + .into_value(ctx).into()); + } + } + + Ok(()) +} + +fn decode_utf8_codepoint(position: usize, expected_bytes: usize, bytes: &[u8]) -> u32 { + match expected_bytes { + 1 => bytes[position] as u32, + 2 => { + ((bytes[position] & 0x1F) as u32) << 6 | + ((bytes[position + 1] & 0x3F) as u32) + } + 3 => { + ((bytes[position] & 0x0F) as u32) << 12 | + ((bytes[position + 1] & 0x3F) as u32) << 6 | + ((bytes[position + 2] & 0x3F) as u32) + } + 4 => { + ((bytes[position] & 0x07) as u32) << 18 | + ((bytes[position + 1] & 0x3F) as u32) << 12 | + ((bytes[position + 2] & 0x3F) as u32) << 6 | + ((bytes[position + 3] & 0x3F) as u32) + } + _ => unreachable!() // this should never happen!! + } +} + +fn adjust_index(index: i64, len: usize) -> usize { + if index > 0 { + index.saturating_sub(1) as usize + } else if index < 0 { + len.saturating_sub(index.unsigned_abs() as usize) + } else { + 0 + } +} + +fn calculate_string_range(start: usize, end: usize, len: usize) -> Option<(usize, usize)> { + if start >= len || (end < start && end != 0) { + None + } else { + Some((start, end.min(len))) + } +} + +pub fn load_utf8(ctx: Context) { + let utf8 = Table::new(&ctx); + + utf8.set_field(ctx, "char", Callback::from_fn(&ctx, |ctx, _, mut stack| { + let mut bytes = Vec::with_capacity(stack.len() * 4); + let iter = stack.into_iter().enumerate(); + + for (idx, i) in iter { + let code = match i.to_integer() { + Some(code) => code as u32, + None => { + return Err(format!("bad argument #{} to 'char' (number expected, got {})", + idx + 1, i.type_name()).into_value(ctx).into()) + } + }; + + if let Some(c) = char::from_u32(code) { + let mut buf = [0; 4]; + let utf8_bytes = c.encode_utf8(&mut buf).as_bytes(); + bytes.extend_from_slice(utf8_bytes); + } else { + return Err(format!("bad argument #{} to 'char' (value out of range)", + idx + 1).into_value(ctx).into()) + } + } + + let result = ctx.intern(&bytes); + stack.replace(ctx, result); + + Ok(CallbackReturn::Return) + })); + + let _ = utf8.set(ctx, "charpattern", r"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"); + + utf8.set_field(ctx, "codes", Callback::from_fn(&ctx, |ctx, _, mut stack| { + #[derive(Collect, Clone)] + #[collect(require_static)] + struct Codes { + s: String, + pos: usize, + } + + impl<'gc> Sequence<'gc> for Codes { + fn poll( + mut self: std::pin::Pin<&mut Self>, + ctx: Context<'gc>, + _exec: crate::Execution<'gc, '_>, + mut stack: crate::Stack<'gc, '_>, + ) -> Result, Error<'gc>> { + let position = self.pos; + let bytes = self.s.as_bytes(); + let len = bytes.len(); + + if position >= len { + stack.replace(ctx, Value::Nil); + return Ok(SequencePoll::Return); + } + + let byte = bytes[position]; + + let expected_bytes = utf8_sequence_length(ctx, byte, position)?; + + validate_utf8_sequence(ctx, position, expected_bytes, bytes)?; + + let code_point = decode_utf8_codepoint(position, expected_bytes, bytes); + + stack.clear(); + stack.into_back(ctx, position as i64 + 1); + stack.into_back(ctx, code_point as i64); + + self.pos += expected_bytes; + + Ok(SequencePoll::Return) + } + } + + let s = stack.consume::(ctx)?; + + let root = Codes { + s: s.to_owned(), + pos: 0 + }; + + let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| { + Ok(CallbackReturn::Sequence(BoxSequence::new(&ctx, root.clone()))) + }); + + stack.replace(ctx, codes); + + Ok(CallbackReturn::Return) + })); + + utf8.set_field(ctx, "len", Callback::from_fn(&ctx, |ctx, _, mut stack| { + let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; + let bytes = s.as_bytes(); + let len = bytes.len(); + + let i = i.unwrap_or(1); + let j = j.unwrap_or(-1); + + let start = adjust_index(i, len); + let end = adjust_index(j, len); + + let (start, end) = match calculate_string_range(start, end, len) { + Some(range) => range, + None => { + stack.replace(ctx, 0); + return Ok(CallbackReturn::Return); + } + }; + + let mut char_count = 0; + let mut position = start; + + while position < end { + let byte = bytes[position]; + + let expected_bytes = match utf8_sequence_length(ctx, byte, position) { + Ok(len) => len, + Err(_) => { + stack.clear(); + stack.push_back(Value::Boolean(false)); + stack.push_back(Value::Integer(position as i64 + 1)); + return Ok(CallbackReturn::Return); + } + }; + + if position + expected_bytes > end { + break; + } + + match validate_utf8_sequence(ctx, position, expected_bytes, bytes) { + Ok(_) => {}, + Err(_) => { + stack.clear(); + stack.push_back(Value::Boolean(false)); + stack.push_back(Value::Integer(position as i64 + 1)); + return Ok(CallbackReturn::Return); + } + } + + char_count += 1; + position += expected_bytes; + } + + stack.replace(ctx, char_count); + Ok(CallbackReturn::Return) + })); + + utf8.set_field(ctx, "codepoint", Callback::from_fn(&ctx, |ctx, _, mut stack| { + let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; + let bytes = s.as_bytes(); + let len = bytes.len(); + + let i = i.unwrap_or(1); + let j = j.unwrap_or(i); + + let start = adjust_index(i, len); + let end = adjust_index(j, len); + + if start >= len || end >= len || end < start { + return Ok(CallbackReturn::Return) + } + + let mut position = start; + let mut codepoints = Vec::new(); + + while position <= end { + if position >= len { + break; + } + + let byte = bytes[position]; + + let expected_bytes = utf8_sequence_length(ctx, byte, position)?; + + validate_utf8_sequence(ctx, position, expected_bytes, bytes)?; + + let code_point = decode_utf8_codepoint(position, expected_bytes, bytes); + + if position <= end { + codepoints.push(code_point as i64); + } + + position += expected_bytes; + } + + stack.clear(); + for codepoint in codepoints { + stack.push_back(Value::Integer(codepoint)); + } + + Ok(CallbackReturn::Return) + })); + + utf8.set_field(ctx, "offset", Callback::from_fn(&ctx, |ctx, _, mut stack| { + let (s, n, i): (String, i64, Option) = stack.consume(ctx)?; + let bytes = s.as_bytes(); + let len = bytes.len(); + + let i = i.unwrap_or(if n >= 0 { 1 } else { len as i64 + 1 }); + + let mut pos = adjust_index(i, len); + + if n == 0 { + if pos >= len { + stack.replace(ctx, Value::Nil); + return Ok(CallbackReturn::Return); + } + + while pos > 0 && (bytes[pos] & 0xC0) == 0x80 { + pos -= 1; + } + + stack.replace(ctx, (pos as i64) + 1); + return Ok(CallbackReturn::Return); + } + + if n > 0 { + let mut count = 0; + + while count < n && pos < len { + if (bytes[pos] & 0xC0) != 0x80 { + count += 1; + } + + if count == n { + break; + } + + pos += 1; + } + + if count == n - 1 && pos == len { + stack.replace(ctx, (pos as i64) + 1); + return Ok(CallbackReturn::Return); + } else if count < n { + stack.replace(ctx, Value::Nil); + return Ok(CallbackReturn::Return); + } + } else if n < 0 { + let mut count = 0; + + if pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 { + while pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 { + pos -= 1; + } + if pos > 0 { + pos -= 1; + } + } else if pos > 0 { + pos -= 1; + } + + while count < (-n) && pos > 0 { + pos -= 1; + + while pos > 0 && (bytes[pos] & 0xC0) == 0x80 { + pos -= 1; + } + + count += 1; + } + + if count < (-n) { + stack.replace(ctx, Value::Nil); + return Ok(CallbackReturn::Return); + } + } + + stack.replace(ctx, (pos as i64) + 1); + Ok(CallbackReturn::Return) + })); + + ctx.set_global("utf8", utf8); +} \ No newline at end of file From b5643dba63eabd99fcb38f50a7e184ed755d4a74 Mon Sep 17 00:00:00 2001 From: lyranowl Date: Fri, 2 May 2025 23:49:55 +0600 Subject: [PATCH 02/12] cargo fmt --- src/stdlib/utf8.rs | 543 ++++++++++++++++++++++++--------------------- 1 file changed, 295 insertions(+), 248 deletions(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index b2372081..3304a886 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -1,8 +1,15 @@ use gc_arena::Collect; -use crate::{BoxSequence, Callback, CallbackReturn, Context, Error, IntoValue, Sequence, SequencePoll, Table, Value}; +use crate::{ + BoxSequence, Callback, CallbackReturn, Context, Error, IntoValue, Sequence, SequencePoll, + Table, Value, +}; -fn utf8_sequence_length<'gc>(ctx: Context<'gc>, byte: u8, position: usize) -> Result> { +fn utf8_sequence_length<'gc>( + ctx: Context<'gc>, + byte: u8, + position: usize, +) -> Result> { if byte & 0x80 == 0 { Ok(1) } else if byte & 0xE0 == 0xC0 { @@ -12,46 +19,55 @@ fn utf8_sequence_length<'gc>(ctx: Context<'gc>, byte: u8, position: usize) -> Re } else if byte & 0xF8 == 0xF0 { Ok(4) } else { - Err(format!("invalid UTF-8 sequence at position {}", position + 1) - .into_value(ctx).into()) + Err( + format!("invalid UTF-8 sequence at position {}", position + 1) + .into_value(ctx) + .into(), + ) } } -fn validate_utf8_sequence<'gc>(ctx: Context<'gc>, position: usize, expected_bytes: usize, bytes: &[u8]) -> Result<(), Error<'gc>> { +fn validate_utf8_sequence<'gc>( + ctx: Context<'gc>, + position: usize, + expected_bytes: usize, + bytes: &[u8], +) -> Result<(), Error<'gc>> { if position + expected_bytes > bytes.len() { - return Err(format!("incomplete UTF-8 code at position {}", position + 1) - .into_value(ctx).into()); + return Err( + format!("incomplete UTF-8 code at position {}", position + 1) + .into_value(ctx) + .into(), + ); } - + for i in 1..expected_bytes { if bytes[position + i] & 0xC0 != 0x80 { return Err(format!("invalid UTF-8 code at position {}", position + 1) - .into_value(ctx).into()); + .into_value(ctx) + .into()); } } - + Ok(()) } fn decode_utf8_codepoint(position: usize, expected_bytes: usize, bytes: &[u8]) -> u32 { match expected_bytes { 1 => bytes[position] as u32, - 2 => { - ((bytes[position] & 0x1F) as u32) << 6 | - ((bytes[position + 1] & 0x3F) as u32) - } + 2 => ((bytes[position] & 0x1F) as u32) << 6 | ((bytes[position + 1] & 0x3F) as u32), 3 => { - ((bytes[position] & 0x0F) as u32) << 12 | - ((bytes[position + 1] & 0x3F) as u32) << 6 | - ((bytes[position + 2] & 0x3F) as u32) + ((bytes[position] & 0x0F) as u32) << 12 + | ((bytes[position + 1] & 0x3F) as u32) << 6 + | ((bytes[position + 2] & 0x3F) as u32) } 4 => { - ((bytes[position] & 0x07) as u32) << 18 | - ((bytes[position + 1] & 0x3F) as u32) << 12 | - ((bytes[position + 2] & 0x3F) as u32) << 6 | - ((bytes[position + 3] & 0x3F) as u32) + ((bytes[position] & 0x07) as u32) << 18 + | ((bytes[position + 1] & 0x3F) as u32) << 12 + | ((bytes[position + 2] & 0x3F) as u32) << 6 + | ((bytes[position + 3] & 0x3F) as u32) } - _ => unreachable!() // this should never happen!! + _ => unreachable!(), // this should never happen!! } } @@ -75,48 +91,63 @@ fn calculate_string_range(start: usize, end: usize, len: usize) -> Option<(usize pub fn load_utf8(ctx: Context) { let utf8 = Table::new(&ctx); - - utf8.set_field(ctx, "char", Callback::from_fn(&ctx, |ctx, _, mut stack| { - let mut bytes = Vec::with_capacity(stack.len() * 4); - let iter = stack.into_iter().enumerate(); - - for (idx, i) in iter { - let code = match i.to_integer() { - Some(code) => code as u32, - None => { - return Err(format!("bad argument #{} to 'char' (number expected, got {})", - idx + 1, i.type_name()).into_value(ctx).into()) + + utf8.set_field( + ctx, + "char", + Callback::from_fn(&ctx, |ctx, _, mut stack| { + let mut bytes = Vec::with_capacity(stack.len() * 4); + let iter = stack.into_iter().enumerate(); + + for (idx, i) in iter { + let code = match i.to_integer() { + Some(code) => code as u32, + None => { + return Err(format!( + "bad argument #{} to 'char' (number expected, got {})", + idx + 1, + i.type_name() + ) + .into_value(ctx) + .into()) + } + }; + + if let Some(c) = char::from_u32(code) { + let mut buf = [0; 4]; + let utf8_bytes = c.encode_utf8(&mut buf).as_bytes(); + bytes.extend_from_slice(utf8_bytes); + } else { + return Err( + format!("bad argument #{} to 'char' (value out of range)", idx + 1) + .into_value(ctx) + .into(), + ); } - }; - - if let Some(c) = char::from_u32(code) { - let mut buf = [0; 4]; - let utf8_bytes = c.encode_utf8(&mut buf).as_bytes(); - bytes.extend_from_slice(utf8_bytes); - } else { - return Err(format!("bad argument #{} to 'char' (value out of range)", - idx + 1).into_value(ctx).into()) } - } - - let result = ctx.intern(&bytes); - stack.replace(ctx, result); - - Ok(CallbackReturn::Return) - })); - + + let result = ctx.intern(&bytes); + stack.replace(ctx, result); + + Ok(CallbackReturn::Return) + }), + ); + let _ = utf8.set(ctx, "charpattern", r"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"); - utf8.set_field(ctx, "codes", Callback::from_fn(&ctx, |ctx, _, mut stack| { - #[derive(Collect, Clone)] - #[collect(require_static)] - struct Codes { - s: String, - pos: usize, - } + utf8.set_field( + ctx, + "codes", + Callback::from_fn(&ctx, |ctx, _, mut stack| { + #[derive(Collect, Clone)] + #[collect(require_static)] + struct Codes { + s: String, + pos: usize, + } - impl<'gc> Sequence<'gc> for Codes { - fn poll( + impl<'gc> Sequence<'gc> for Codes { + fn poll( mut self: std::pin::Pin<&mut Self>, ctx: Context<'gc>, _exec: crate::Execution<'gc, '_>, @@ -125,227 +156,243 @@ pub fn load_utf8(ctx: Context) { let position = self.pos; let bytes = self.s.as_bytes(); let len = bytes.len(); - + if position >= len { stack.replace(ctx, Value::Nil); return Ok(SequencePoll::Return); } - + let byte = bytes[position]; - + let expected_bytes = utf8_sequence_length(ctx, byte, position)?; - + validate_utf8_sequence(ctx, position, expected_bytes, bytes)?; - + let code_point = decode_utf8_codepoint(position, expected_bytes, bytes); - + stack.clear(); stack.into_back(ctx, position as i64 + 1); stack.into_back(ctx, code_point as i64); - + self.pos += expected_bytes; - + Ok(SequencePoll::Return) + } } - } - let s = stack.consume::(ctx)?; - - let root = Codes { - s: s.to_owned(), - pos: 0 - }; - - let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| { - Ok(CallbackReturn::Sequence(BoxSequence::new(&ctx, root.clone()))) - }); - - stack.replace(ctx, codes); - - Ok(CallbackReturn::Return) - })); - - utf8.set_field(ctx, "len", Callback::from_fn(&ctx, |ctx, _, mut stack| { - let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; - let bytes = s.as_bytes(); - let len = bytes.len(); - - let i = i.unwrap_or(1); - let j = j.unwrap_or(-1); - - let start = adjust_index(i, len); - let end = adjust_index(j, len); - - let (start, end) = match calculate_string_range(start, end, len) { - Some(range) => range, - None => { - stack.replace(ctx, 0); - return Ok(CallbackReturn::Return); - } - }; - - let mut char_count = 0; - let mut position = start; - - while position < end { - let byte = bytes[position]; - - let expected_bytes = match utf8_sequence_length(ctx, byte, position) { - Ok(len) => len, - Err(_) => { - stack.clear(); - stack.push_back(Value::Boolean(false)); - stack.push_back(Value::Integer(position as i64 + 1)); + let s = stack.consume::(ctx)?; + + let root = Codes { + s: s.to_owned(), + pos: 0, + }; + + let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| { + Ok(CallbackReturn::Sequence(BoxSequence::new( + &ctx, + root.clone(), + ))) + }); + + stack.replace(ctx, codes); + + Ok(CallbackReturn::Return) + }), + ); + + utf8.set_field( + ctx, + "len", + Callback::from_fn(&ctx, |ctx, _, mut stack| { + let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; + let bytes = s.as_bytes(); + let len = bytes.len(); + + let i = i.unwrap_or(1); + let j = j.unwrap_or(-1); + + let start = adjust_index(i, len); + let end = adjust_index(j, len); + + let (start, end) = match calculate_string_range(start, end, len) { + Some(range) => range, + None => { + stack.replace(ctx, 0); return Ok(CallbackReturn::Return); } }; - - if position + expected_bytes > end { - break; - } - - match validate_utf8_sequence(ctx, position, expected_bytes, bytes) { - Ok(_) => {}, - Err(_) => { - stack.clear(); - stack.push_back(Value::Boolean(false)); - stack.push_back(Value::Integer(position as i64 + 1)); - return Ok(CallbackReturn::Return); + + let mut char_count = 0; + let mut position = start; + + while position < end { + let byte = bytes[position]; + + let expected_bytes = match utf8_sequence_length(ctx, byte, position) { + Ok(len) => len, + Err(_) => { + stack.clear(); + stack.push_back(Value::Boolean(false)); + stack.push_back(Value::Integer(position as i64 + 1)); + return Ok(CallbackReturn::Return); + } + }; + + if position + expected_bytes > end { + break; } + + match validate_utf8_sequence(ctx, position, expected_bytes, bytes) { + Ok(_) => {} + Err(_) => { + stack.clear(); + stack.push_back(Value::Boolean(false)); + stack.push_back(Value::Integer(position as i64 + 1)); + return Ok(CallbackReturn::Return); + } + } + + char_count += 1; + position += expected_bytes; } - - char_count += 1; - position += expected_bytes; - } - - stack.replace(ctx, char_count); - Ok(CallbackReturn::Return) - })); - - utf8.set_field(ctx, "codepoint", Callback::from_fn(&ctx, |ctx, _, mut stack| { - let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; - let bytes = s.as_bytes(); - let len = bytes.len(); - - let i = i.unwrap_or(1); - let j = j.unwrap_or(i); - - let start = adjust_index(i, len); - let end = adjust_index(j, len); - - if start >= len || end >= len || end < start { - return Ok(CallbackReturn::Return) - } - - let mut position = start; - let mut codepoints = Vec::new(); - - while position <= end { - if position >= len { - break; - } - - let byte = bytes[position]; - - let expected_bytes = utf8_sequence_length(ctx, byte, position)?; - - validate_utf8_sequence(ctx, position, expected_bytes, bytes)?; - - let code_point = decode_utf8_codepoint(position, expected_bytes, bytes); - - if position <= end { - codepoints.push(code_point as i64); - } - - position += expected_bytes; - } - - stack.clear(); - for codepoint in codepoints { - stack.push_back(Value::Integer(codepoint)); - } - - Ok(CallbackReturn::Return) - })); - - utf8.set_field(ctx, "offset", Callback::from_fn(&ctx, |ctx, _, mut stack| { - let (s, n, i): (String, i64, Option) = stack.consume(ctx)?; - let bytes = s.as_bytes(); - let len = bytes.len(); - - let i = i.unwrap_or(if n >= 0 { 1 } else { len as i64 + 1 }); - - let mut pos = adjust_index(i, len); - - if n == 0 { - if pos >= len { - stack.replace(ctx, Value::Nil); + + stack.replace(ctx, char_count); + Ok(CallbackReturn::Return) + }), + ); + + utf8.set_field( + ctx, + "codepoint", + Callback::from_fn(&ctx, |ctx, _, mut stack| { + let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; + let bytes = s.as_bytes(); + let len = bytes.len(); + + let i = i.unwrap_or(1); + let j = j.unwrap_or(i); + + let start = adjust_index(i, len); + let end = adjust_index(j, len); + + if start >= len || end >= len || end < start { return Ok(CallbackReturn::Return); } - - while pos > 0 && (bytes[pos] & 0xC0) == 0x80 { - pos -= 1; - } - - stack.replace(ctx, (pos as i64) + 1); - return Ok(CallbackReturn::Return); - } - - if n > 0 { - let mut count = 0; - - while count < n && pos < len { - if (bytes[pos] & 0xC0) != 0x80 { - count += 1; - } - - if count == n { + + let mut position = start; + let mut codepoints = Vec::new(); + + while position <= end { + if position >= len { break; } - - pos += 1; + + let byte = bytes[position]; + + let expected_bytes = utf8_sequence_length(ctx, byte, position)?; + + validate_utf8_sequence(ctx, position, expected_bytes, bytes)?; + + let code_point = decode_utf8_codepoint(position, expected_bytes, bytes); + + if position <= end { + codepoints.push(code_point as i64); + } + + position += expected_bytes; } - - if count == n - 1 && pos == len { + + stack.clear(); + for codepoint in codepoints { + stack.push_back(Value::Integer(codepoint)); + } + + Ok(CallbackReturn::Return) + }), + ); + + utf8.set_field( + ctx, + "offset", + Callback::from_fn(&ctx, |ctx, _, mut stack| { + let (s, n, i): (String, i64, Option) = stack.consume(ctx)?; + let bytes = s.as_bytes(); + let len = bytes.len(); + + let i = i.unwrap_or(if n >= 0 { 1 } else { len as i64 + 1 }); + + let mut pos = adjust_index(i, len); + + if n == 0 { + if pos >= len { + stack.replace(ctx, Value::Nil); + return Ok(CallbackReturn::Return); + } + + while pos > 0 && (bytes[pos] & 0xC0) == 0x80 { + pos -= 1; + } + stack.replace(ctx, (pos as i64) + 1); return Ok(CallbackReturn::Return); - } else if count < n { - stack.replace(ctx, Value::Nil); - return Ok(CallbackReturn::Return); } - } else if n < 0 { - let mut count = 0; - - if pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 { - while pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 { - pos -= 1; + + if n > 0 { + let mut count = 0; + + while count < n && pos < len { + if (bytes[pos] & 0xC0) != 0x80 { + count += 1; + } + + if count == n { + break; + } + + pos += 1; } - if pos > 0 { + + if count == n - 1 && pos == len { + stack.replace(ctx, (pos as i64) + 1); + return Ok(CallbackReturn::Return); + } else if count < n { + stack.replace(ctx, Value::Nil); + return Ok(CallbackReturn::Return); + } + } else if n < 0 { + let mut count = 0; + + if pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 { + while pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 { + pos -= 1; + } + if pos > 0 { + pos -= 1; + } + } else if pos > 0 { pos -= 1; } - } else if pos > 0 { - pos -= 1; - } - - while count < (-n) && pos > 0 { - pos -= 1; - - while pos > 0 && (bytes[pos] & 0xC0) == 0x80 { + + while count < (-n) && pos > 0 { pos -= 1; + + while pos > 0 && (bytes[pos] & 0xC0) == 0x80 { + pos -= 1; + } + + count += 1; + } + + if count < (-n) { + stack.replace(ctx, Value::Nil); + return Ok(CallbackReturn::Return); } - - count += 1; - } - - if count < (-n) { - stack.replace(ctx, Value::Nil); - return Ok(CallbackReturn::Return); } - } - - stack.replace(ctx, (pos as i64) + 1); - Ok(CallbackReturn::Return) - })); - + + stack.replace(ctx, (pos as i64) + 1); + Ok(CallbackReturn::Return) + }), + ); + ctx.set_global("utf8", utf8); -} \ No newline at end of file +} From d5447babbc1c353b8e8360db2e37bba3415a419e Mon Sep 17 00:00:00 2001 From: lyranowl Date: Sat, 3 May 2025 01:11:34 +0600 Subject: [PATCH 03/12] `feature`: added tests for all functions, fix some bugs --- src/lua.rs | 3 +- src/stdlib/utf8.rs | 162 +++++++++++++++---------- tests/scripts/utf8.lua | 260 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 364 insertions(+), 61 deletions(-) create mode 100644 tests/scripts/utf8.lua diff --git a/src/lua.rs b/src/lua.rs index 67f03007..ce270d87 100644 --- a/src/lua.rs +++ b/src/lua.rs @@ -9,7 +9,7 @@ use gc_arena::{ use crate::{ finalizers::Finalizers, stash::{Fetchable, Stashable}, - stdlib::{load_base, load_coroutine, load_io, load_math, load_string, load_table}, + stdlib::{load_base, load_coroutine, load_io, load_math, load_string, load_table, load_utf8}, string::InternedStringSet, thread::BadThreadMode, Error, ExternError, FromMultiValue, FromValue, Fuel, IntoValue, Registry, RuntimeError, @@ -176,6 +176,7 @@ impl Lua { load_math(ctx); load_string(ctx); load_table(ctx); + load_utf8(ctx); }) } diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index 3304a886..cd85c04d 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -1,3 +1,5 @@ +use std::{rc::Rc, sync::atomic::AtomicUsize, sync::atomic::Ordering}; + use gc_arena::Collect; use crate::{ @@ -118,11 +120,12 @@ pub fn load_utf8(ctx: Context) { let utf8_bytes = c.encode_utf8(&mut buf).as_bytes(); bytes.extend_from_slice(utf8_bytes); } else { - return Err( - format!("bad argument #{} to 'char' (value out of range)", idx + 1) - .into_value(ctx) - .into(), - ); + return Err(format!( + "bad argument #{} to 'char' (value out of range)", + idx + 1 + ) + .into_value(ctx) + .into()); } } @@ -143,38 +146,48 @@ pub fn load_utf8(ctx: Context) { #[collect(require_static)] struct Codes { s: String, - pos: usize, + pos: Rc, } impl<'gc> Sequence<'gc> for Codes { fn poll( - mut self: std::pin::Pin<&mut Self>, + self: std::pin::Pin<&mut Self>, ctx: Context<'gc>, _exec: crate::Execution<'gc, '_>, mut stack: crate::Stack<'gc, '_>, ) -> Result, Error<'gc>> { - let position = self.pos; + let position = Rc::clone(&self.pos); let bytes = self.s.as_bytes(); let len = bytes.len(); - if position >= len { + if position.load(Ordering::Relaxed) >= len { stack.replace(ctx, Value::Nil); return Ok(SequencePoll::Return); } - let byte = bytes[position]; + let byte = bytes[position.load(Ordering::Relaxed)]; - let expected_bytes = utf8_sequence_length(ctx, byte, position)?; + let expected_bytes = + utf8_sequence_length(ctx, byte, position.load(Ordering::Relaxed))?; - validate_utf8_sequence(ctx, position, expected_bytes, bytes)?; + validate_utf8_sequence( + ctx, + position.load(Ordering::Relaxed), + expected_bytes, + bytes, + )?; - let code_point = decode_utf8_codepoint(position, expected_bytes, bytes); + let code_point = decode_utf8_codepoint( + position.load(Ordering::Relaxed), + expected_bytes, + bytes, + ); stack.clear(); - stack.into_back(ctx, position as i64 + 1); + stack.into_back(ctx, position.load(Ordering::Relaxed) as i64 + 1); stack.into_back(ctx, code_point as i64); - self.pos += expected_bytes; + self.pos.fetch_add(expected_bytes, Ordering::Relaxed); Ok(SequencePoll::Return) } @@ -184,7 +197,7 @@ pub fn load_utf8(ctx: Context) { let root = Codes { s: s.to_owned(), - pos: 0, + pos: Rc::new(AtomicUsize::new(0)), }; let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| { @@ -225,29 +238,29 @@ pub fn load_utf8(ctx: Context) { let mut char_count = 0; let mut position = start; - while position < end { + while position <= end { + if position >= len { + break; + } + let byte = bytes[position]; let expected_bytes = match utf8_sequence_length(ctx, byte, position) { Ok(len) => len, Err(_) => { stack.clear(); - stack.push_back(Value::Boolean(false)); - stack.push_back(Value::Integer(position as i64 + 1)); + stack.into_back(ctx, Value::Nil); + stack.into_back(ctx, position as i64 + 1); return Ok(CallbackReturn::Return); } }; - if position + expected_bytes > end { - break; - } - match validate_utf8_sequence(ctx, position, expected_bytes, bytes) { Ok(_) => {} Err(_) => { stack.clear(); - stack.push_back(Value::Boolean(false)); - stack.push_back(Value::Integer(position as i64 + 1)); + stack.into_back(ctx, Value::Nil); + stack.into_back(ctx, position as i64 + 1); return Ok(CallbackReturn::Return); } } @@ -265,6 +278,16 @@ pub fn load_utf8(ctx: Context) { ctx, "codepoint", Callback::from_fn(&ctx, |ctx, _, mut stack| { + fn is_valid_lua_index(index: i64, length: i64) -> bool { + if index == 0 { + false + } else if index > 0 { + index <= length + } else { + index >= -length + } + } + let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; let bytes = s.as_bytes(); let len = bytes.len(); @@ -272,10 +295,23 @@ pub fn load_utf8(ctx: Context) { let i = i.unwrap_or(1); let j = j.unwrap_or(i); + if !is_valid_lua_index(j, len as i64) { + return Err("bad argument #3 to 'codepoint' (out of bounds)" + .into_value(ctx) + .into()); + } + + if !is_valid_lua_index(i, len as i64) { + return Err(format!("bad argument #2 to 'codepoint' (out of bounds)",) + .into_value(ctx) + .into()); + } + let start = adjust_index(i, len); let end = adjust_index(j, len); - if start >= len || end >= len || end < start { + if start >= len || end < start { + // Return empty result if normalized range is invalid return Ok(CallbackReturn::Return); } @@ -321,27 +357,39 @@ pub fn load_utf8(ctx: Context) { let i = i.unwrap_or(if n >= 0 { 1 } else { len as i64 + 1 }); - let mut pos = adjust_index(i, len); + if i == 0 { + return Err("bad argument #3 to 'offset' (position out of bounds)" + .into_value(ctx) + .into()); + } + + let mut position = adjust_index(i, len); + + if n != 0 && position < len && (bytes[position] & 0xC0) == 0x80 { + return Err("initial position is a continuation byte" + .into_value(ctx) + .into()); + } if n == 0 { - if pos >= len { + if position >= len { stack.replace(ctx, Value::Nil); return Ok(CallbackReturn::Return); } - while pos > 0 && (bytes[pos] & 0xC0) == 0x80 { - pos -= 1; + while position > 0 && (bytes[position] & 0xC0) == 0x80 { + position -= 1; } - stack.replace(ctx, (pos as i64) + 1); + stack.replace(ctx, (position as i64) + 1); return Ok(CallbackReturn::Return); } if n > 0 { let mut count = 0; - while count < n && pos < len { - if (bytes[pos] & 0xC0) != 0x80 { + while count < n && position < len { + if (bytes[position] & 0xC0) != 0x80 { count += 1; } @@ -349,47 +397,41 @@ pub fn load_utf8(ctx: Context) { break; } - pos += 1; + position += 1; + } + + if count == n { + stack.replace(ctx, (position as i64) + 1); + return Ok(CallbackReturn::Return); } - if count == n - 1 && pos == len { - stack.replace(ctx, (pos as i64) + 1); + if count == n - 1 && position == len { + stack.replace(ctx, (position as i64) + 1); return Ok(CallbackReturn::Return); } else if count < n { stack.replace(ctx, Value::Nil); return Ok(CallbackReturn::Return); } } else if n < 0 { - let mut count = 0; - - if pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 { - while pos > 0 && (bytes[pos - 1] & 0xC0) == 0x80 { - pos -= 1; - } - if pos > 0 { - pos -= 1; - } - } else if pos > 0 { - pos -= 1; - } + let target_count = -n; + let mut count = 0i64; - while count < (-n) && pos > 0 { - pos -= 1; + let mut current_byte_index = adjust_index(i, len); - while pos > 0 && (bytes[pos] & 0xC0) == 0x80 { - pos -= 1; + while count < target_count { + if current_byte_index == 0 { + stack.replace(ctx, Value::Nil); + return Ok(CallbackReturn::Return); + } + current_byte_index -= 1; + if (bytes[current_byte_index] & 0xC0) != 0x80 { + count += 1; } - - count += 1; - } - - if count < (-n) { - stack.replace(ctx, Value::Nil); - return Ok(CallbackReturn::Return); } + stack.replace(ctx, (current_byte_index as i64) + 1); + return Ok(CallbackReturn::Return); } - stack.replace(ctx, (pos as i64) + 1); Ok(CallbackReturn::Return) }), ); diff --git a/tests/scripts/utf8.lua b/tests/scripts/utf8.lua new file mode 100644 index 00000000..ef7319be --- /dev/null +++ b/tests/scripts/utf8.lua @@ -0,0 +1,260 @@ +function is_err(f, ...) + local status, err = pcall(f, ...) + return not status, err +end + +function collect_codes(s) + local results = {} + local err_status, err_val = pcall(function() + for p, c in utf8.codes(s) do + table.insert(results, {p, c}) + end + end) + if not err_status then + return false, err_val + end + return results +end + +function collect_codepoints(s, i, j) + local results = {} + local args = {s, i, j} + local err_status, err_val = pcall(function() + local values = {utf8.codepoint(table.unpack(args))} + for _, v in ipairs(values) do + table.insert(results, v) + end + end) + if not err_status then + return false, err_val + end + return results +end + +do + assert(utf8.char() == "") + assert(utf8.char(65) == "A") + assert(utf8.char(65, 66, 67) == "ABC") + assert(utf8.char(0x41, 0x42, 0x43) == "ABC") + assert(utf8.char(1055, 1088, 1080, 1074, 1077, 1090) == "Привет") + assert(utf8.char(72, 1080, 33) == "Hи!") + assert(utf8.char(0xC2, 0xA2) == "\195\130\194\162") + assert(utf8.char(162) == "\194\162") + assert(utf8.char(0xE2, 0x82, 0xAC) == "\195\162\194\130\194\172") + assert(utf8.char(8364) == "\226\130\172") + assert(utf8.char(0xF0, 0x9F, 0x98, 0x80) == "\195\176\194\159\194\152\194\128") + assert(utf8.char(128512) == "\240\159\152\128") + assert(utf8.char(0) == "\0") + assert(utf8.char(65, 0, 66) == "A\0B") + assert(utf8.char(0x7F) == "\127") + assert(utf8.char(0x80) == "\194\128") + assert(utf8.char(0x7FF) == "\223\191") + assert(utf8.char(0x800) == "\224\160\128") + assert(utf8.char(0xFFFF) == "\239\191\191") + assert(utf8.char(0x10000) == "\240\144\128\128") + assert(utf8.char(0x10FFFF) == "\244\143\191\191") + assert(is_err(utf8.char, "A")) + assert(is_err(utf8.char, 65, "B")) + assert(is_err(utf8.char, {})) + assert(is_err(utf8.char, nil)) + assert(is_err(utf8.char, -1)) + assert(is_err(utf8.char, 0x110000)) + assert(is_err(utf8.char, 0xD800)) + assert(is_err(utf8.char, 0xDFFF)) + assert(is_err(utf8.char, 0x110000)) + assert(is_err(utf8.char, "not a number")) +end + +do + assert(utf8.charpattern == "[\\0-\\x7F\\xC2-\\xF4][\\x80-\\xBF]*") +end + +do + local empty_codes = collect_codes("") + assert(type(empty_codes) == "table" and #empty_codes == 0) + + local abc_codes = collect_codes("ABC") + assert(type(abc_codes) == "table" and #abc_codes == 3) + assert(abc_codes[1][1] == 1 and abc_codes[1][2] == 65) + assert(abc_codes[2][1] == 2 and abc_codes[2][2] == 66) + assert(abc_codes[3][1] == 3 and abc_codes[3][2] == 67) + + local ab0c_codes = collect_codes("AB\0C") + assert(type(ab0c_codes) == "table" and #ab0c_codes == 4) + assert(ab0c_codes[1][1] == 1 and ab0c_codes[1][2] == 65) + assert(ab0c_codes[2][1] == 2 and ab0c_codes[2][2] == 66) + assert(ab0c_codes[3][1] == 3 and ab0c_codes[3][2] == 0) + assert(ab0c_codes[4][1] == 4 and ab0c_codes[4][2] == 67) + + local privet = "Привет" + local privet_codes = collect_codes(privet) + assert(#privet_codes == 6) + assert(privet_codes[1][1] == 1 and privet_codes[1][2] == 1055) + assert(privet_codes[2][1] == 3 and privet_codes[2][2] == 1088) + assert(privet_codes[3][1] == 5 and privet_codes[3][2] == 1080) + assert(privet_codes[4][1] == 7 and privet_codes[4][2] == 1074) + assert(privet_codes[5][1] == 9 and privet_codes[5][2] == 1077) + assert(privet_codes[6][1] == 11 and privet_codes[6][2] == 1090) + + local hieuro = "Hi€!" + local hieuro_codes = collect_codes(hieuro) + assert(#hieuro_codes == 4) + assert(hieuro_codes[1][1] == 1 and hieuro_codes[1][2] == 72) + assert(hieuro_codes[2][1] == 2 and hieuro_codes[2][2] == 105) + assert(hieuro_codes[3][1] == 3 and hieuro_codes[3][2] == 8364) + assert(hieuro_codes[4][1] == 6 and hieuro_codes[4][2] == 33) + + local emoji = "😀" + local emoji_codes = collect_codes(emoji) + assert(#emoji_codes == 1) + assert(emoji_codes[1][1] == 1 and emoji_codes[1][2] == 128512) + + assert(collect_codes("abc\xE2\x82") == false) + assert(collect_codes("abc\xE2\x82\xFF") == false) + assert(collect_codes("abc\xFF") == false) + assert(collect_codes("\xC0\x80") == false) +end + +do + local s = "ABC" + assert(table.concat(collect_codepoints(s), ",") == "65") + assert(table.concat(collect_codepoints(s, 1), ",") == "65") + assert(table.concat(collect_codepoints(s, 2), ",") == "66") + assert(table.concat(collect_codepoints(s, 3), ",") == "67") + assert(collect_codepoints(s, 4) == false) + assert(table.concat(collect_codepoints(s, 1, 1), ",") == "65") + assert(table.concat(collect_codepoints(s, 1, 2), ",") == "65,66") + assert(table.concat(collect_codepoints(s, 1, 3), ",") == "65,66,67") + assert(table.concat(collect_codepoints(s, 2, 3), ",") == "66,67") + assert(table.concat(collect_codepoints(s, 3, 3), ",") == "67") + assert(collect_codepoints(s, 1, 10) == false) + assert(table.concat(collect_codepoints(s, 3, 1), ",") == "") + assert(table.concat(collect_codepoints(s, -1), ",") == "67") + assert(table.concat(collect_codepoints(s, -2), ",") == "66") + assert(table.concat(collect_codepoints(s, -3), ",") == "65") + assert(table.concat(collect_codepoints(s, -3, -1), ",") == "65,66,67") + assert(table.concat(collect_codepoints(s, -2, -1), ",") == "66,67") + assert(table.concat(collect_codepoints(s, -1, -1), ",") == "67") + assert(table.concat(collect_codepoints(s, 1, -1), ",") == "65,66,67") + assert(table.concat(collect_codepoints(s, 2, -1), ",") == "66,67") + assert(table.concat(collect_codepoints(s, 1, -2), ",") == "65,66") + assert(table.concat(collect_codepoints(s, -3, 3), ",") == "65,66,67") + assert(table.concat(collect_codepoints(s, -3, 1), ",") == "65") + + local privet = "Привет" + assert(table.concat(collect_codepoints(privet, 1), ",") == "1055") + assert(table.concat(collect_codepoints(privet, 1), ",") == "1055") + assert(table.concat(collect_codepoints(privet, 3), ",") == "1088") + assert(table.concat(collect_codepoints(privet, 1, 2), ",") == "1055") + assert(table.concat(collect_codepoints(privet, 1, 3), ",") == "1055,1088") + assert(table.concat(collect_codepoints(privet, 1, 4), ",") == "1055,1088") + assert(table.concat(collect_codepoints(privet, 1, 12), ",") == "1055,1088,1080,1074,1077,1090") + assert(table.concat(collect_codepoints(privet, 3, 7), ",") == "1088,1080,1074") + assert(table.concat(collect_codepoints(privet, -2, -1), ",") == "1090") + assert(table.concat(collect_codepoints(privet, 11, -1), ",") == "1090") + assert(table.concat(collect_codepoints(privet, 1, -1), ",") == "1055,1088,1080,1074,1077,1090") + assert(collect_codepoints("", 1, 1) == false) + + local emoji = "😀" + assert(table.concat(collect_codepoints(emoji), ",") == "128512") + assert(table.concat(collect_codepoints(emoji, 1), ",") == "128512") + assert(table.concat(collect_codepoints(emoji, 1), ",") == "128512") + assert(table.concat(collect_codepoints(emoji, 1, 4), ",") == "128512") + assert(collect_codepoints("abc\xE2\x82", 1) == false) + assert(collect_codepoints("abc\xE2\x82\xFF", 1) == false) + assert(collect_codepoints("abc\xFF", 1) == false) + assert(collect_codepoints("abc\xFF", 4) == false) + assert(collect_codepoints("abc\xE2\x82", 1, 5) == false) + assert(collect_codepoints("abc\xE2\x20\xAC", 1, 6) == false) +end + +do + assert(utf8.len("") == 0) + assert(utf8.len("ABC") == 3) + assert(utf8.len("При") == 3) + assert(utf8.len("Привет") == 6) + assert(utf8.len("😀") == 1) + assert(utf8.len("A😀B") == 3) + assert(utf8.len("A\0B") == 3) + + local s = "Привет" + assert(utf8.len(s, 1, 1) == 1) + assert(utf8.len(s, 1, 2) == 1) + assert(utf8.len(s, 1, 3) == 2) + assert(utf8.len(s, 1, 4) == 2) + assert(utf8.len(s, 3, 4) == 1) + assert(utf8.len(s, 3, 6) == 2) + assert(utf8.len(s, 1, 12) == 6) + assert(utf8.len(s, 1, -1) == 6) + assert(utf8.len(s, -12, -1) == 6) + assert(utf8.len(s, -2, -1) == 1) + assert(utf8.len(s, 11, 12) == 1) + assert(utf8.len(s, 1, 6) == 3) + assert(utf8.len(s, 7, 12) == 3) + assert(utf8.len(s, 13, 20) == 0) + assert(utf8.len(s, 5, 1) == 0) + assert(utf8.len(s, 1, 11) == 6) +end + +do + local s = "Привет" + assert(utf8.offset(s, 0) == 1) + assert(utf8.offset(s, 1) == 1) + assert(utf8.offset(s, 2) == 3) + assert(utf8.offset(s, 6) == 11) + assert(utf8.offset(s, 7) == 13) + assert(utf8.offset(s, 8) == nil) + assert(utf8.offset(s, -1) == 11) + assert(utf8.offset(s, -2) == 9) + assert(utf8.offset(s, -6) == 1) + assert(utf8.offset(s, -7) == nil) + assert(utf8.offset(s, 1, 1) == 1) + assert(is_err(utf8.offset, s, 1, 2)) + assert(utf8.offset(s, 1, 3) == 3) + assert(utf8.offset(s, 2, 3) == 5) + assert(utf8.offset(s, 1, 11) == 11) + assert(is_err(utf8.offset, s, 1, 12)) + assert(utf8.offset(s, 1, 13) == 13) + assert(utf8.offset(s, 2, 11) == 13) + assert(is_err(utf8.offset, s, 2, 12)) + assert(is_err(utf8.offset, s, -1, 12)) + assert(utf8.offset(s, -1, 11) == 9) + assert(utf8.offset(s, -1, 3) == 1) + assert(is_err(utf8.offset, s, -1, 2)) + assert(utf8.offset(s, -1, 1) == nil) + assert(is_err(utf8.offset, s, -2, 12)) + assert(is_err(utf8.offset, s, -6, 12)) + assert(is_err(utf8.offset, s, -7, 12)) + assert(utf8.offset(s, -1, #s + 1) == 11) + assert(utf8.offset(s, 0, 1) == 1) + assert(utf8.offset(s, 0, 2) == 1) + assert(utf8.offset(s, 0, 3) == 3) + assert(utf8.offset(s, 0, 4) == 3) + assert(utf8.offset(s, 0, 11) == 11) + assert(utf8.offset(s, 0, 12) == 11) + assert(utf8.offset(s, 0, 13) == nil) + assert(is_err(utf8.offset, s, 0, 0)) + assert(utf8.offset(s, 0, -1) == 11) + assert(utf8.offset(s, 0, -12) == 1) + + local ascii = "ABCDEFG" + assert(utf8.offset(ascii, 3, 1) == 3) + assert(utf8.offset(ascii, -3, 7) == 4) + assert(utf8.offset(ascii, 0, 5) == 5) + + local emoji = "A😀B" + assert(utf8.offset(emoji, 1) == 1) + assert(utf8.offset(emoji, 2) == 2) + assert(utf8.offset(emoji, 3) == 6) + assert(utf8.offset(emoji, 4) == 7) + assert(utf8.offset(emoji, -1) == 6) + assert(utf8.offset(emoji, -2) == 2) + assert(utf8.offset(emoji, -3) == 1) + assert(utf8.offset(emoji, 0, 1) == 1) + assert(utf8.offset(emoji, 0, 2) == 2) + assert(utf8.offset(emoji, 0, 3) == 2) + assert(utf8.offset(emoji, 0, 4) == 2) + assert(utf8.offset(emoji, 0, 5) == 2) + assert(utf8.offset(emoji, 0, 6) == 6) + assert(utf8.offset(emoji, 0, 7) == nil) +end From 2c0320c1db3fcc334458865cba5d8df393eba9f6 Mon Sep 17 00:00:00 2001 From: lyranowl Date: Mon, 2 Jun 2025 17:49:51 +0500 Subject: [PATCH 04/12] `refactor`: simplify `utf8.codes` --- src/stdlib/utf8.rs | 91 +++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 61 deletions(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index cd85c04d..5e96536b 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -1,10 +1,5 @@ -use std::{rc::Rc, sync::atomic::AtomicUsize, sync::atomic::Ordering}; - -use gc_arena::Collect; - use crate::{ - BoxSequence, Callback, CallbackReturn, Context, Error, IntoValue, Sequence, SequencePoll, - Table, Value, + Callback, CallbackReturn, Context, Error, IntoValue, String as LuaString, Table, Value, }; fn utf8_sequence_length<'gc>( @@ -142,72 +137,46 @@ pub fn load_utf8(ctx: Context) { ctx, "codes", Callback::from_fn(&ctx, |ctx, _, mut stack| { - #[derive(Collect, Clone)] - #[collect(require_static)] - struct Codes { - s: String, - pos: Rc, - } + let s = stack.consume::(ctx)?; - impl<'gc> Sequence<'gc> for Codes { - fn poll( - self: std::pin::Pin<&mut Self>, - ctx: Context<'gc>, - _exec: crate::Execution<'gc, '_>, - mut stack: crate::Stack<'gc, '_>, - ) -> Result, Error<'gc>> { - let position = Rc::clone(&self.pos); - let bytes = self.s.as_bytes(); - let len = bytes.len(); - - if position.load(Ordering::Relaxed) >= len { - stack.replace(ctx, Value::Nil); - return Ok(SequencePoll::Return); - } + let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| { + let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?; - let byte = bytes[position.load(Ordering::Relaxed)]; + let s = s.to_str()?; + let n = (n - 1) as usize; - let expected_bytes = - utf8_sequence_length(ctx, byte, position.load(Ordering::Relaxed))?; + if n >= s.len() { + stack.replace(ctx, (Value::Nil, Value::Nil)); + return Ok(CallbackReturn::Return); + } - validate_utf8_sequence( - ctx, - position.load(Ordering::Relaxed), - expected_bytes, - bytes, - )?; + let bytes = &s.as_bytes()[n..]; - let code_point = decode_utf8_codepoint( - position.load(Ordering::Relaxed), - expected_bytes, - bytes, - ); + let mut chunks = bytes.utf8_chunks(); - stack.clear(); - stack.into_back(ctx, position.load(Ordering::Relaxed) as i64 + 1); - stack.into_back(ctx, code_point as i64); + if let Some(chunk) = chunks.next() { + if !chunk.invalid().is_empty() { + return Err("Invalid UTF-8 byte sequence".into_value(ctx).into()); + } - self.pos.fetch_add(expected_bytes, Ordering::Relaxed); + if let Some(c) = chunk.valid().chars().next() { + let len = c.len_utf8(); + let p = n + len; + let p = (p + 1) as i64; - Ok(SequencePoll::Return) + stack.replace(ctx, (p, c as i64)); + Ok(CallbackReturn::Return) + } else { + stack.replace(ctx, (Value::Nil, Value::Nil)); + Ok(CallbackReturn::Return) + } + } else { + stack.replace(ctx, (Value::Nil, Value::Nil)); + Ok(CallbackReturn::Return) } - } - - let s = stack.consume::(ctx)?; - - let root = Codes { - s: s.to_owned(), - pos: Rc::new(AtomicUsize::new(0)), - }; - - let codes = Callback::from_fn_with(&ctx, root, |root, ctx, _, _| { - Ok(CallbackReturn::Sequence(BoxSequence::new( - &ctx, - root.clone(), - ))) }); - stack.replace(ctx, codes); + stack.replace(ctx, (callback, s, 1)); Ok(CallbackReturn::Return) }), From eccc6fa524c9a2c82e6b7f15b4e7e8ce180c4813 Mon Sep 17 00:00:00 2001 From: lyranowl Date: Mon, 2 Jun 2025 20:39:54 +0500 Subject: [PATCH 05/12] `fix`: index conversion --- src/stdlib/utf8.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index 5e96536b..24ff666c 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -143,7 +143,7 @@ pub fn load_utf8(ctx: Context) { let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?; let s = s.to_str()?; - let n = (n - 1) as usize; + let n = adjust_index(n, s.len()); if n >= s.len() { stack.replace(ctx, (Value::Nil, Value::Nil)); From e7439757d5d8ce8808de3403431e9ccd82a947ff Mon Sep 17 00:00:00 2001 From: lyranowl Date: Mon, 2 Jun 2025 22:57:44 +0500 Subject: [PATCH 06/12] `fix`: try to fix indexing bug --- src/stdlib/utf8.rs | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index 24ff666c..3c0dcccc 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -139,11 +139,17 @@ pub fn load_utf8(ctx: Context) { Callback::from_fn(&ctx, |ctx, _, mut stack| { let s = stack.consume::(ctx)?; - let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| { + let callback = Callback::from_fn_with(&ctx, None, |first_call, ctx, _, mut stack| { let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?; + if n == 0 { + stack.into_back(ctx, 1); + } else { + stack.into_back(ctx, n + 1); + } + let s = s.to_str()?; - let n = adjust_index(n, s.len()); + let n = n as usize; if n >= s.len() { stack.replace(ctx, (Value::Nil, Value::Nil)); @@ -160,11 +166,13 @@ pub fn load_utf8(ctx: Context) { } if let Some(c) = chunk.valid().chars().next() { - let len = c.len_utf8(); - let p = n + len; - let p = (p + 1) as i64; - - stack.replace(ctx, (p, c as i64)); + if c.is_ascii() { + stack.into_back(ctx, c as i64); + } else { + let len = c.len_utf8(); + let n = stack.consume::(ctx)?; + stack.replace(ctx, (n + len as i64, c as i64)); + } Ok(CallbackReturn::Return) } else { stack.replace(ctx, (Value::Nil, Value::Nil)); @@ -176,7 +184,7 @@ pub fn load_utf8(ctx: Context) { } }); - stack.replace(ctx, (callback, s, 1)); + stack.replace(ctx, (callback, s, 0)); Ok(CallbackReturn::Return) }), From b23cbb52268a6c586dd59e09e60d7ce757fc3065 Mon Sep 17 00:00:00 2001 From: lyranowl Date: Mon, 2 Jun 2025 22:59:40 +0500 Subject: [PATCH 07/12] `fix`: compile error --- src/stdlib/utf8.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index 3c0dcccc..4ec21e8f 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -139,7 +139,7 @@ pub fn load_utf8(ctx: Context) { Callback::from_fn(&ctx, |ctx, _, mut stack| { let s = stack.consume::(ctx)?; - let callback = Callback::from_fn_with(&ctx, None, |first_call, ctx, _, mut stack| { + let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| { let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?; if n == 0 { From b214a11eef506d4b3eec9b248529f86cb1a6907e Mon Sep 17 00:00:00 2001 From: lyranowl Date: Mon, 2 Jun 2025 23:06:46 +0500 Subject: [PATCH 08/12] `fix`: indexing bug --- src/stdlib/utf8.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index 4ec21e8f..975d05a4 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -166,12 +166,15 @@ pub fn load_utf8(ctx: Context) { } if let Some(c) = chunk.valid().chars().next() { - if c.is_ascii() { - stack.into_back(ctx, c as i64); + if n == 0 { + stack.replace(ctx, (1, c as i64)); } else { - let len = c.len_utf8(); - let n = stack.consume::(ctx)?; - stack.replace(ctx, (n + len as i64, c as i64)); + if c.is_ascii() { + stack.replace(ctx, (n as i64 + 1, c as i64)); + } else { + let len = c.len_utf8(); + stack.replace(ctx, ((n + len) as i64, c as i64)); + } } Ok(CallbackReturn::Return) } else { From e01396d7a9f4e463ced95221771a008c33276543 Mon Sep 17 00:00:00 2001 From: lyranowl Date: Mon, 2 Jun 2025 23:08:05 +0500 Subject: [PATCH 09/12] `fix`: remove useless if branch --- src/stdlib/utf8.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index 975d05a4..28fa5ccd 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -142,12 +142,6 @@ pub fn load_utf8(ctx: Context) { let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| { let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?; - if n == 0 { - stack.into_back(ctx, 1); - } else { - stack.into_back(ctx, n + 1); - } - let s = s.to_str()?; let n = n as usize; From a787394af6ccfdf2e791d443fe629014673189fe Mon Sep 17 00:00:00 2001 From: lyranowl Date: Tue, 3 Jun 2025 21:09:37 +0500 Subject: [PATCH 10/12] `refactor`: simplify `utf8.len` --- src/stdlib/utf8.rs | 73 +++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 52 deletions(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index 28fa5ccd..50d986d8 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -78,14 +78,6 @@ fn adjust_index(index: i64, len: usize) -> usize { } } -fn calculate_string_range(start: usize, end: usize, len: usize) -> Option<(usize, usize)> { - if start >= len || (end < start && end != 0) { - None - } else { - Some((start, end.min(len))) - } -} - pub fn load_utf8(ctx: Context) { let utf8 = Table::new(&ctx); @@ -142,15 +134,15 @@ pub fn load_utf8(ctx: Context) { let callback = Callback::from_fn(&ctx, |ctx, _, mut stack| { let (s, n) = stack.consume::<(LuaString, i64)>(ctx)?; - let s = s.to_str()?; + let bytes = s.as_bytes(); let n = n as usize; - if n >= s.len() { + if n >= bytes.len() { stack.replace(ctx, (Value::Nil, Value::Nil)); return Ok(CallbackReturn::Return); } - let bytes = &s.as_bytes()[n..]; + let bytes = &bytes[n..]; let mut chunks = bytes.utf8_chunks(); @@ -192,58 +184,35 @@ pub fn load_utf8(ctx: Context) { "len", Callback::from_fn(&ctx, |ctx, _, mut stack| { let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; - let bytes = s.as_bytes(); - let len = bytes.len(); - - let i = i.unwrap_or(1); - let j = j.unwrap_or(-1); - let start = adjust_index(i, len); - let end = adjust_index(j, len); - - let (start, end) = match calculate_string_range(start, end, len) { - Some(range) => range, - None => { - stack.replace(ctx, 0); + let s = match std::str::from_utf8(s.as_bytes()) { + Ok(s) => s, + Err(err) => { + let position = err.error_len().unwrap_or_default(); + stack.replace(ctx, (false, position as i64 + 1)); return Ok(CallbackReturn::Return); } }; - let mut char_count = 0; - let mut position = start; + let len = s.len(); - while position <= end { - if position >= len { - break; - } + let i = i.unwrap_or(1); + let j = j.unwrap_or(-1); - let byte = bytes[position]; + let start = adjust_index(i, len); + let end = adjust_index(j, len); - let expected_bytes = match utf8_sequence_length(ctx, byte, position) { - Ok(len) => len, - Err(_) => { - stack.clear(); - stack.into_back(ctx, Value::Nil); - stack.into_back(ctx, position as i64 + 1); - return Ok(CallbackReturn::Return); - } - }; + if start >= len || (end < start && end != 0) { + stack.replace(ctx, 0); + return Ok(CallbackReturn::Return); + } - match validate_utf8_sequence(ctx, position, expected_bytes, bytes) { - Ok(_) => {} - Err(_) => { - stack.clear(); - stack.into_back(ctx, Value::Nil); - stack.into_back(ctx, position as i64 + 1); - return Ok(CallbackReturn::Return); - } - } + let end = end.min(len); - char_count += 1; - position += expected_bytes; - } + let s = &s[start..=end]; + + stack.replace(ctx, s.chars().count() as i64); - stack.replace(ctx, char_count); Ok(CallbackReturn::Return) }), ); From 0b5b72cde512d8a58fa7b63744961e7d19e00792 Mon Sep 17 00:00:00 2001 From: lyranowl Date: Tue, 3 Jun 2025 22:07:43 +0500 Subject: [PATCH 11/12] `refactor`: simplify `utf8.codepoint` --- src/stdlib/utf8.rs | 179 +++++++++++---------------------------------- 1 file changed, 42 insertions(+), 137 deletions(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index 50d986d8..9eb9e77b 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -1,81 +1,20 @@ -use crate::{ - Callback, CallbackReturn, Context, Error, IntoValue, String as LuaString, Table, Value, -}; - -fn utf8_sequence_length<'gc>( - ctx: Context<'gc>, - byte: u8, - position: usize, -) -> Result> { - if byte & 0x80 == 0 { - Ok(1) - } else if byte & 0xE0 == 0xC0 { - Ok(2) - } else if byte & 0xF0 == 0xE0 { - Ok(3) - } else if byte & 0xF8 == 0xF0 { - Ok(4) - } else { - Err( - format!("invalid UTF-8 sequence at position {}", position + 1) - .into_value(ctx) - .into(), - ) - } +use crate::{Callback, CallbackReturn, Context, IntoValue, String as LuaString, Table, Value}; + +fn convert_index(i: i64, len: usize) -> Option { + let val = match i { + 0 => 0, + v @ 1.. => v - 1, + v @ ..=-1 => (len as i64 + v).max(0), + }; + usize::try_from(val).ok() } -fn validate_utf8_sequence<'gc>( - ctx: Context<'gc>, - position: usize, - expected_bytes: usize, - bytes: &[u8], -) -> Result<(), Error<'gc>> { - if position + expected_bytes > bytes.len() { - return Err( - format!("incomplete UTF-8 code at position {}", position + 1) - .into_value(ctx) - .into(), - ); - } - - for i in 1..expected_bytes { - if bytes[position + i] & 0xC0 != 0x80 { - return Err(format!("invalid UTF-8 code at position {}", position + 1) - .into_value(ctx) - .into()); - } - } - - Ok(()) -} - -fn decode_utf8_codepoint(position: usize, expected_bytes: usize, bytes: &[u8]) -> u32 { - match expected_bytes { - 1 => bytes[position] as u32, - 2 => ((bytes[position] & 0x1F) as u32) << 6 | ((bytes[position + 1] & 0x3F) as u32), - 3 => { - ((bytes[position] & 0x0F) as u32) << 12 - | ((bytes[position + 1] & 0x3F) as u32) << 6 - | ((bytes[position + 2] & 0x3F) as u32) - } - 4 => { - ((bytes[position] & 0x07) as u32) << 18 - | ((bytes[position + 1] & 0x3F) as u32) << 12 - | ((bytes[position + 2] & 0x3F) as u32) << 6 - | ((bytes[position + 3] & 0x3F) as u32) - } - _ => unreachable!(), // this should never happen!! - } -} - -fn adjust_index(index: i64, len: usize) -> usize { - if index > 0 { - index.saturating_sub(1) as usize - } else if index < 0 { - len.saturating_sub(index.unsigned_abs() as usize) - } else { - 0 - } +fn convert_index_end(i: i64, len: usize) -> Option { + let val = match i { + v @ 0.. => v, + v @ ..=-1 => (len as i64 + v + 1).max(0), + }; + usize::try_from(val).ok() } pub fn load_utf8(ctx: Context) { @@ -193,22 +132,19 @@ pub fn load_utf8(ctx: Context) { return Ok(CallbackReturn::Return); } }; - let len = s.len(); - let i = i.unwrap_or(1); - let j = j.unwrap_or(-1); - - let start = adjust_index(i, len); - let end = adjust_index(j, len); + let start = convert_index(i.unwrap_or(1), len).unwrap_or(usize::MAX); + let end = convert_index_end(j.unwrap_or(len as i64), len) + .unwrap_or(usize::MAX) + .min(len); + // TODO: we need to check this conditions if start >= len || (end < start && end != 0) { stack.replace(ctx, 0); return Ok(CallbackReturn::Return); } - let end = end.min(len); - let s = &s[start..=end]; stack.replace(ctx, s.chars().count() as i64); @@ -221,70 +157,39 @@ pub fn load_utf8(ctx: Context) { ctx, "codepoint", Callback::from_fn(&ctx, |ctx, _, mut stack| { - fn is_valid_lua_index(index: i64, length: i64) -> bool { - if index == 0 { - false - } else if index > 0 { - index <= length - } else { - index >= -length - } - } - let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; - let bytes = s.as_bytes(); - let len = bytes.len(); + + let s = std::str::from_utf8(s.as_bytes()).map_err(|err| { + format!( + "bad argument #1 to 'codepoint' (invalid byte sequence at {})", + err.error_len().unwrap_or_default() + ) + .into_value(ctx) + })?; + let len = s.len(); let i = i.unwrap_or(1); let j = j.unwrap_or(i); - if !is_valid_lua_index(j, len as i64) { - return Err("bad argument #3 to 'codepoint' (out of bounds)" - .into_value(ctx) - .into()); - } + let start = convert_index(i, len).unwrap_or(usize::MAX); + let end = convert_index_end(j, len).unwrap_or(usize::MAX).min(len); - if !is_valid_lua_index(i, len as i64) { - return Err(format!("bad argument #2 to 'codepoint' (out of bounds)",) - .into_value(ctx) - .into()); + if start > len { + stack.replace(ctx, Value::Nil); + return Ok(CallbackReturn::Return); } - let start = adjust_index(i, len); - let end = adjust_index(j, len); + if start < 1 { + return Err("bad argument #2 (out of range)".into_value(ctx).into()); + } - if start >= len || end < start { - // Return empty result if normalized range is invalid + if start > end { return Ok(CallbackReturn::Return); } - let mut position = start; - let mut codepoints = Vec::new(); - - while position <= end { - if position >= len { - break; - } - - let byte = bytes[position]; - - let expected_bytes = utf8_sequence_length(ctx, byte, position)?; - - validate_utf8_sequence(ctx, position, expected_bytes, bytes)?; - - let code_point = decode_utf8_codepoint(position, expected_bytes, bytes); - - if position <= end { - codepoints.push(code_point as i64); - } - - position += expected_bytes; - } + let s = &s[start..=end]; - stack.clear(); - for codepoint in codepoints { - stack.push_back(Value::Integer(codepoint)); - } + stack.extend(s.chars().map(|c| Value::Integer(c as i64))); Ok(CallbackReturn::Return) }), @@ -306,7 +211,7 @@ pub fn load_utf8(ctx: Context) { .into()); } - let mut position = adjust_index(i, len); + let mut position = convert_index(i, len).unwrap_or(usize::MAX); if n != 0 && position < len && (bytes[position] & 0xC0) == 0x80 { return Err("initial position is a continuation byte" @@ -359,7 +264,7 @@ pub fn load_utf8(ctx: Context) { let target_count = -n; let mut count = 0i64; - let mut current_byte_index = adjust_index(i, len); + let mut current_byte_index = convert_index(i, len).unwrap_or(usize::MAX); while count < target_count { if current_byte_index == 0 { From ebe528eef2df248b39debb2edc32b983a5a4e7e4 Mon Sep 17 00:00:00 2001 From: lyranowl Date: Sun, 10 Aug 2025 20:51:14 +0500 Subject: [PATCH 12/12] `fix`: recommendations for change have been implemented --- src/stdlib/utf8.rs | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/src/stdlib/utf8.rs b/src/stdlib/utf8.rs index 9eb9e77b..8e3fe99c 100644 --- a/src/stdlib/utf8.rs +++ b/src/stdlib/utf8.rs @@ -94,12 +94,8 @@ pub fn load_utf8(ctx: Context) { if n == 0 { stack.replace(ctx, (1, c as i64)); } else { - if c.is_ascii() { - stack.replace(ctx, (n as i64 + 1, c as i64)); - } else { - let len = c.len_utf8(); - stack.replace(ctx, ((n + len) as i64, c as i64)); - } + let len = c.len_utf8(); + stack.replace(ctx, ((n + len) as i64, c as i64)); } Ok(CallbackReturn::Return) } else { @@ -124,14 +120,6 @@ pub fn load_utf8(ctx: Context) { Callback::from_fn(&ctx, |ctx, _, mut stack| { let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; - let s = match std::str::from_utf8(s.as_bytes()) { - Ok(s) => s, - Err(err) => { - let position = err.error_len().unwrap_or_default(); - stack.replace(ctx, (false, position as i64 + 1)); - return Ok(CallbackReturn::Return); - } - }; let len = s.len(); let start = convert_index(i.unwrap_or(1), len).unwrap_or(usize::MAX); @@ -147,6 +135,15 @@ pub fn load_utf8(ctx: Context) { let s = &s[start..=end]; + let s = match std::str::from_utf8(s.as_bytes()) { + Ok(s) => s, + Err(err) => { + let position = err.error_len().unwrap_or_default(); + stack.replace(ctx, (false, position as i64 + 1)); + return Ok(CallbackReturn::Return); + } + }; + stack.replace(ctx, s.chars().count() as i64); Ok(CallbackReturn::Return) @@ -159,13 +156,6 @@ pub fn load_utf8(ctx: Context) { Callback::from_fn(&ctx, |ctx, _, mut stack| { let (s, i, j) = stack.consume::<(String, Option, Option)>(ctx)?; - let s = std::str::from_utf8(s.as_bytes()).map_err(|err| { - format!( - "bad argument #1 to 'codepoint' (invalid byte sequence at {})", - err.error_len().unwrap_or_default() - ) - .into_value(ctx) - })?; let len = s.len(); let i = i.unwrap_or(1); @@ -189,6 +179,14 @@ pub fn load_utf8(ctx: Context) { let s = &s[start..=end]; + let s = std::str::from_utf8(s.as_bytes()).map_err(|err| { + format!( + "bad argument #1 to 'codepoint' (invalid byte sequence at {})", + err.error_len().unwrap_or_default() + ) + .into_value(ctx) + })?; + stack.extend(s.chars().map(|c| Value::Integer(c as i64))); Ok(CallbackReturn::Return)