diff --git a/core/engine/src/builtins/intl/segmenter/iterator.rs b/core/engine/src/builtins/intl/segmenter/iterator.rs index 330869143c4..a90715b5a86 100644 --- a/core/engine/src/builtins/intl/segmenter/iterator.rs +++ b/core/engine/src/builtins/intl/segmenter/iterator.rs @@ -127,7 +127,7 @@ impl SegmentIterator { .segmenter .downcast_ref::() .expect("segment iterator object should contain a segmenter"); - let mut segments = segmenter.native.segment(string); + let mut segments = segmenter.native.segment(string.variant()); // the first elem is always 0. segments.next(); segments diff --git a/core/engine/src/builtins/intl/segmenter/mod.rs b/core/engine/src/builtins/intl/segmenter/mod.rs index d73bcd04760..2846506ec48 100644 --- a/core/engine/src/builtins/intl/segmenter/mod.rs +++ b/core/engine/src/builtins/intl/segmenter/mod.rs @@ -1,15 +1,7 @@ use std::ops::Range; -use boa_gc::{Finalize, Trace}; -use icu_collator::provider::CollationDiacriticsV1; -use icu_locale::Locale; -use icu_segmenter::{ - GraphemeClusterSegmenter, SentenceSegmenter, WordSegmenter, - options::{SentenceBreakOptions, WordBreakOptions}, -}; - use crate::{ - Context, JsArgs, JsData, JsNativeError, JsResult, JsStr, JsString, JsSymbol, JsValue, + Context, JsArgs, JsData, JsNativeError, JsResult, JsString, JsSymbol, JsValue, builtins::{ BuiltInBuilder, BuiltInConstructor, BuiltInObject, IntrinsicObject, options::{get_option, get_options_object}, @@ -21,6 +13,14 @@ use crate::{ realm::Realm, string::StaticJsStrings, }; +use boa_gc::{Finalize, Trace}; +use boa_string::JsStrVariant; +use icu_collator::provider::CollationDiacriticsV1; +use icu_locale::Locale; +use icu_segmenter::{ + GraphemeClusterSegmenter, SentenceSegmenter, WordSegmenter, + options::{SentenceBreakOptions, WordBreakOptions}, +}; mod iterator; mod options; @@ -62,9 +62,12 @@ impl NativeSegmenter { /// Segment the passed string, returning an iterator with the index boundaries /// of the segments. 
- pub(crate) fn segment<'l, 's>(&'l self, input: JsStr<'s>) -> NativeSegmentIterator<'l, 's> { - match input.variant() { - crate::string::JsStrVariant::Latin1(input) => match self { + pub(crate) fn segment<'l, 's>( + &'l self, + input: JsStrVariant<'s>, + ) -> NativeSegmentIterator<'l, 's> { + match input { + JsStrVariant::Latin1(input) => match self { Self::Grapheme(g) => { NativeSegmentIterator::GraphemeLatin1(g.as_borrowed().segment_latin1(input)) } @@ -75,7 +78,7 @@ impl NativeSegmenter { NativeSegmentIterator::SentenceLatin1(s.as_borrowed().segment_latin1(input)) } }, - crate::string::JsStrVariant::Utf16(input) => match self { + JsStrVariant::Utf16(input) => match self { Self::Grapheme(g) => { NativeSegmentIterator::GraphemeUtf16(g.as_borrowed().segment_utf16(input)) } diff --git a/core/engine/src/builtins/intl/segmenter/segments.rs b/core/engine/src/builtins/intl/segmenter/segments.rs index ff2ee4846c5..c71522730a6 100644 --- a/core/engine/src/builtins/intl/segmenter/segments.rs +++ b/core/engine/src/builtins/intl/segmenter/segments.rs @@ -88,7 +88,7 @@ impl Segments { // 8. Let startIndex be ! FindBoundary(segmenter, string, n, before). // 9. Let endIndex be ! FindBoundary(segmenter, string, n, after). let (range, is_word_like) = { - let mut segments = segmenter.native.segment(segments.string.as_str()); + let mut segments = segmenter.native.segment(segments.string.variant()); std::iter::from_fn(|| segments.next().map(|i| (i, segments.is_word_like()))) .tuple_windows() .find(|((i, _), (j, _))| (*i..*j).contains(&n)) diff --git a/core/engine/src/builtins/json/mod.rs b/core/engine/src/builtins/json/mod.rs index ea49e05ff27..1a8da715598 100644 --- a/core/engine/src/builtins/json/mod.rs +++ b/core/engine/src/builtins/json/mod.rs @@ -386,7 +386,7 @@ impl Json { // 7. Else if Type(space) is String, then } else if let Some(s) = space.as_string() { // a. If the length of space is 10 or less, let gap be space; otherwise let gap be the substring of space from 0 to 10. 
- js_string!(s.get(..10).unwrap_or(s.as_str())) + s.get(..10).unwrap_or(s) // 8. Else, } else { // a. Let gap be the empty String. diff --git a/core/engine/src/builtins/number/globals.rs b/core/engine/src/builtins/number/globals.rs index 4ef72b4323f..39c8c92b611 100644 --- a/core/engine/src/builtins/number/globals.rs +++ b/core/engine/src/builtins/number/globals.rs @@ -247,7 +247,7 @@ pub(crate) fn parse_int(_: &JsValue, args: &[JsValue], context: &mut Context) -> // 0 digit, at the option of the implementation; and if R is not 2, 4, 8, 10, 16, or 32, then // mathInt may be an implementation-approximated value representing the integer value that is // represented by Z in radix-R notation.) - let math_int = from_js_str_radix(z, r).expect("Already checked"); + let math_int = from_js_str_radix(z.as_str(), r).expect("Already checked"); // 15. If mathInt = 0, then // a. If sign = -1, return -0𝔽. @@ -303,11 +303,6 @@ pub(crate) fn parse_float( args: &[JsValue], context: &mut Context, ) -> JsResult { - const PLUS_CHAR: u16 = b'+' as u16; - const MINUS_CHAR: u16 = b'-' as u16; - const LOWER_CASE_I_CHAR: u16 = b'i' as u16; - const UPPER_CASE_I_CHAR: u16 = b'I' as u16; - let Some(string) = args.first() else { return Ok(JsValue::nan()); }; @@ -333,10 +328,23 @@ pub(crate) fn parse_float( // 5. Let parsedNumber be ParseText(trimmedPrefix, StrDecimalLiteral). // 6. Assert: parsedNumber is a Parse Node. // 7. Return the StringNumericValue of parsedNumber. - let (positive, prefix) = match trimmed_string.get(0) { - Some(PLUS_CHAR) => (true, trimmed_string.get(1..).unwrap_or(JsStr::latin1(&[]))), - Some(MINUS_CHAR) => (false, trimmed_string.get(1..).unwrap_or(JsStr::latin1(&[]))), - _ => (true, trimmed_string), + let (positive, prefix) = match trimmed_string + .code_unit_at(0) + .and_then(|c| char::from_u32(u32::from(c))) + { + Some('+') => ( + true, + trimmed_string + .get(1..) + .unwrap_or(StaticJsStrings::EMPTY_STRING), + ), + Some('-') => ( + false, + trimmed_string + .get(1..) 
+ .unwrap_or(StaticJsStrings::EMPTY_STRING), + ), + _ => (true, trimmed_string.clone()), }; if prefix.starts_with(js_str!("Infinity")) { @@ -344,7 +352,10 @@ pub(crate) fn parse_float( return Ok(JsValue::positive_infinity()); } return Ok(JsValue::negative_infinity()); - } else if let Some(LOWER_CASE_I_CHAR | UPPER_CASE_I_CHAR) = prefix.get(0) { + } else if let Some('I' | 'i') = prefix + .code_unit_at(0) + .and_then(|c| char::from_u32(u32::from(c))) + { return Ok(JsValue::nan()); } diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 786db22ed6f..925bc78bfaf 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -1788,7 +1788,7 @@ impl RegExp { // 17. Return the string-concatenation of accumulatedResult and the substring of S from nextSourcePosition. Ok(js_string!( &JsString::from(&accumulated_result[..]), - s.get_expect(next_source_position..) + &s.get_expect(next_source_position..) ) .into()) } diff --git a/core/engine/src/builtins/string/mod.rs b/core/engine/src/builtins/string/mod.rs index 1d94e058433..b166085003e 100644 --- a/core/engine/src/builtins/string/mod.rs +++ b/core/engine/src/builtins/string/mod.rs @@ -620,11 +620,10 @@ impl String { match position { // 4. Let size be the length of S. - IntegerOrInfinity::Integer(i) if i >= 0 => { + IntegerOrInfinity::Integer(i) if i >= 0 && i < string.len() as i64 => { // 6. Return the Number value for the numeric value of the code unit at index position within the String S. - Ok(string - .get(i as usize) - .map_or_else(JsValue::nan, JsValue::from)) + // SAFETY: already validated the index. + Ok(unsafe { string.code_unit_at(i as usize).unwrap_unchecked() }.into()) } // 5. If position < 0 or position ≥ size, return NaN. _ => Ok(JsValue::nan()), @@ -1043,7 +1042,7 @@ impl String { }; // 10. Let preserved be the substring of string from 0 to position. 
- let preserved = JsString::from(string.get_expect(..position)); + let preserved = string.get_expect(..position); let replacement = match replace_value { // 11. If functionalReplace is true, then @@ -1080,7 +1079,7 @@ impl String { Ok(js_string!( &preserved, &replacement, - &JsString::from(string.get_expect(position + search_length..)) + &string.get_expect(position + search_length..) ) .into()) } @@ -1675,7 +1674,7 @@ impl String { // 2. Return ? TrimString(S, end). let object = this.require_object_coercible()?; let string = object.to_string(context)?; - Ok(js_string!(string.trim_end()).into()) + Ok(string.trim_end().into()) } /// [`String.prototype.toUpperCase()`][upper] and [`String.prototype.toLowerCase()`][lower] @@ -1957,9 +1956,8 @@ impl String { if separator_length == 0 { // a. Let head be the substring of S from 0 to lim. // b. Let codeUnits be a List consisting of the sequence of code units that are the elements of head. - let head = this_str - .get(..lim) - .unwrap_or(this_str.as_str()) + let head_str = this_str.get(..lim).unwrap_or(this_str); + let head = head_str .iter() .map(|code| js_string!(std::slice::from_ref(&code)).into()); diff --git a/core/engine/src/builtins/string/tests.rs b/core/engine/src/builtins/string/tests.rs index 05c2fa414c3..2a3a0cb1921 100644 --- a/core/engine/src/builtins/string/tests.rs +++ b/core/engine/src/builtins/string/tests.rs @@ -761,7 +761,7 @@ fn char_at() { #[test] fn char_code_at() { run_test_actions([ - TestAction::assert_eq("'abc'.charCodeAt-1", f64::NAN), + TestAction::assert_eq("'abc'.charCodeAt(-1)", f64::NAN), TestAction::assert_eq("'abc'.charCodeAt(1)", 98), TestAction::assert_eq("'abc'.charCodeAt(9)", f64::NAN), TestAction::assert_eq("'abc'.charCodeAt()", 97), diff --git a/core/engine/src/builtins/uri/mod.rs b/core/engine/src/builtins/uri/mod.rs index 74a313939c6..bd8ef4bcbee 100644 --- a/core/engine/src/builtins/uri/mod.rs +++ b/core/engine/src/builtins/uri/mod.rs @@ -309,7 +309,7 @@ where } // b. 
Let C be the code unit at index k within string. - let c = string.get_expect(k); + let c = string.code_unit_at(k).expect("Bounds were verified"); // c. If C is in unescapedSet, then if unescaped_set(c) { @@ -384,7 +384,7 @@ where } // b. Let C be the code unit at index k within string. - let c = string.get_expect(k); + let c = string.code_unit_at(k).expect("Bounds were verified"); // c. If C is not the code unit 0x0025 (PERCENT SIGN), then #[allow(clippy::if_not_else)] @@ -406,10 +406,17 @@ where // iii. If the code units at index (k + 1) and (k + 2) within string do not represent // hexadecimal digits, throw a URIError exception. // iv. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). - let b = decode_hex_byte(string.get_expect(k + 1), string.get_expect(k + 2)) - .ok_or_else(|| { - JsNativeError::uri().with_message("invalid hexadecimal digit found") - })?; + + // SAFETY: the indices have been verified as valid already. + let (high, low) = unsafe { + ( + string.code_unit_at(k + 1).unwrap_unchecked(), + string.code_unit_at(k + 2).unwrap_unchecked(), + ) + }; + let b = decode_hex_byte(high, low).ok_or_else(|| { + JsNativeError::uri().with_message("invalid hexadecimal digit found") + })?; // v. Set k to k + 2. k += 2; @@ -457,7 +464,7 @@ where k += 1; // b. If the code unit at index k within string is not the code unit 0x0025 (PERCENT SIGN), throw a URIError exception. - if string.get_expect(k) != 0x0025 { + if string.code_unit_at(k) != Some(0x0025) { return Err(JsNativeError::uri() .with_message("escape characters must be preceded with a % sign") .into()); @@ -465,10 +472,16 @@ where // c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception. // d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2).
- let b = decode_hex_byte(string.get_expect(k + 1), string.get_expect(k + 2)) - .ok_or_else(|| { - JsNativeError::uri().with_message("invalid hexadecimal digit found") - })?; + // SAFETY: the indices have been verified as valid already. + let (high, low) = unsafe { + ( + string.code_unit_at(k + 1).unwrap_unchecked(), + string.code_unit_at(k + 2).unwrap_unchecked(), + ) + }; + let b = decode_hex_byte(high, low).ok_or_else(|| { + JsNativeError::uri().with_message("invalid hexadecimal digit found") + })?; // e. Set k to k + 2. k += 2; diff --git a/core/engine/src/vm/opcode/get/property.rs b/core/engine/src/vm/opcode/get/property.rs index 8429f6892d3..2fa56335415 100644 --- a/core/engine/src/vm/opcode/get/property.rs +++ b/core/engine/src/vm/opcode/get/property.rs @@ -114,7 +114,7 @@ fn get_by_value( } } else if let Some(string) = base.as_string() { let value = string - .get(index.get() as usize) + .code_unit_at(index.get() as usize) .map_or_else(JsValue::undefined, |char| { js_string!([char].as_slice()).into() }); diff --git a/core/string/src/code_point.rs b/core/string/src/code_point.rs index 57970a0d39e..54ab6238d14 100644 --- a/core/string/src/code_point.rs +++ b/core/string/src/code_point.rs @@ -76,3 +76,16 @@ impl std::fmt::Display for CodePoint { } } } + +impl From for CodePoint { + fn from(value: char) -> Self { + Self::Unicode(value) + } +} + +impl From for CodePoint { + fn from(value: u16) -> Self { + char::from_u32(u32::from(value)) + .map_or_else(|| CodePoint::UnpairedSurrogate(value), CodePoint::Unicode) + } +} diff --git a/core/string/src/display.rs b/core/string/src/display.rs index fac511b4649..17e585db03a 100644 --- a/core/string/src/display.rs +++ b/core/string/src/display.rs @@ -5,10 +5,11 @@ use std::cell::RefCell; use std::fmt; use std::fmt::Write; -/// Display implementation for [`JsString`] that escapes unicode characters. -#[derive(Debug)] +/// `Display` implementation for [`JsString`] that escapes unicode characters. 
+// This should not implement debug, only be shown as a standard display. +#[allow(missing_debug_implementations)] pub struct JsStrDisplayEscaped<'a> { - inner: JsStr<'a>, + inner: &'a JsString, } impl fmt::Display for JsStrDisplayEscaped<'_> { @@ -30,14 +31,15 @@ impl fmt::Display for JsStrDisplayEscaped<'_> { } } -impl<'a> From> for JsStrDisplayEscaped<'a> { - fn from(inner: JsStr<'a>) -> Self { +impl<'a> From<&'a JsString> for JsStrDisplayEscaped<'a> { + fn from(inner: &'a JsString) -> Self { Self { inner } } } -/// Display implementation for [`JsString`] that escapes unicode characters. -#[derive(Debug)] +/// `Display` implementation for [`JsString`] that escapes unicode characters. +// This should not implement debug, only be shown as a standard display. +#[allow(missing_debug_implementations)] pub struct JsStrDisplayLossy<'a> { inner: JsStr<'a>, } @@ -117,35 +119,35 @@ impl<'a> From<&'a JsString> for JsStringDebugInfo<'a> { #[test] fn latin1() { // 0xE9 is `é` in ISO-8859-1 (see https://www.ascii-code.com/ISO-8859-1). - let s = JsStr::latin1(b"Hello \xE9 world!"); + let s = JsString::from("Hello \u{E9} world!"); - let rust_str = format!("{}", JsStrDisplayEscaped { inner: s }); + let rust_str = format!("{}", JsStrDisplayEscaped { inner: &s }); assert_eq!(rust_str, "Hello é world!"); - let rust_str = format!("{}", JsStrDisplayLossy { inner: s }); + let rust_str = format!("{}", JsStrDisplayLossy { inner: s.as_str() }); assert_eq!(rust_str, "Hello é world!"); } #[test] fn emoji() { // 0x1F600 is `😀` (see https://www.fileformat.info/info/unicode/char/1f600/index.htm). 
- let s = JsStr::utf16(&[0xD83D, 0xDE00]); + let s = JsString::from(&[0xD83D, 0xDE00]); - let rust_str = format!("{}", JsStrDisplayEscaped { inner: s }); + let rust_str = format!("{}", JsStrDisplayEscaped { inner: &s }); assert_eq!(rust_str, "😀"); - let rust_str = format!("{}", JsStrDisplayLossy { inner: s }); + let rust_str = format!("{}", JsStrDisplayLossy { inner: s.as_str() }); assert_eq!(rust_str, "😀"); } #[test] fn unpaired_surrogates() { // 0xD800 is an unpaired surrogate (see https://www.fileformat.info/info/unicode/char/d800/index.htm). - let s = JsStr::utf16(&[0xD800]); + let s = JsString::from(&[0xD800]); - let rust_str = format!("{}", JsStrDisplayEscaped { inner: s }); + let rust_str = format!("{}", JsStrDisplayEscaped { inner: &s }); assert_eq!(rust_str, "\\uD800"); - let rust_str = format!("{}", JsStrDisplayLossy { inner: s }); + let rust_str = format!("{}", JsStrDisplayLossy { inner: s.as_str() }); assert_eq!(rust_str, "�"); } diff --git a/core/string/src/iter.rs b/core/string/src/iter.rs index e90c394d7b7..d327b06187d 100644 --- a/core/string/src/iter.rs +++ b/core/string/src/iter.rs @@ -17,6 +17,7 @@ pub struct Iter<'a> { } impl<'a> Iter<'a> { + #[inline] pub(crate) fn new(s: JsStr<'a>) -> Self { let inner = match s.variant() { JsStrVariant::Latin1(s) => IterInner::U8(s.iter().copied()), @@ -65,6 +66,7 @@ pub struct Windows<'a> { } impl<'a> Windows<'a> { + #[inline] pub(crate) fn new(string: JsStr<'a>, size: usize) -> Self { let inner = match string.variant() { JsStrVariant::Latin1(v) => WindowsInner::U8(v.windows(size)), @@ -110,6 +112,7 @@ pub struct CodePointsIter<'a> { } impl<'a> CodePointsIter<'a> { + #[inline] pub(crate) fn new(s: JsStr<'a>) -> Self { let inner = match s.variant() { JsStrVariant::Latin1(s) => CodePointsIterInner::Latin1(s.iter().copied()), diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index d4ff6538c3a..2f7978fe339 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -24,8 +24,9 @@ mod vtable; 
#[cfg(test)] mod tests; -use self::{iter::Windows, str::JsSliceIndex}; +use self::iter::Windows; use crate::display::{JsStrDisplayEscaped, JsStrDisplayLossy, JsStringDebugInfo}; +use crate::iter::CodePointsIter; use crate::r#type::{Latin1, Utf16}; pub use crate::vtable::StaticString; use crate::vtable::{SequenceString, SliceString}; @@ -234,8 +235,36 @@ impl JsString { /// Decodes a [`JsString`] into an iterator of [`Result`], returning surrogates as /// errors. #[inline] - pub fn to_std_string_with_surrogates(&self) -> impl Iterator> + '_ { - self.as_str().to_std_string_with_surrogates() + #[allow(clippy::missing_panics_doc)] + pub fn to_std_string_with_surrogates( + &self, + ) -> impl Iterator> + use<'_> { + let mut iter = self.code_points().peekable(); + + std::iter::from_fn(move || { + let cp = iter.next()?; + let char = match cp { + CodePoint::Unicode(c) => c, + CodePoint::UnpairedSurrogate(surr) => return Some(Err(surr)), + }; + + let mut string = String::from(char); + + loop { + let Some(cp) = iter.peek().and_then(|cp| match cp { + CodePoint::Unicode(c) => Some(*c), + CodePoint::UnpairedSurrogate(_) => None, + }) else { + break; + }; + + string.push(cp); + + iter.next().expect("should exist by the check above"); + } + + Some(Ok(string)) + }) } /// Maps the valid segments of an UTF16 string and leaves the unpaired surrogates unchanged. @@ -259,8 +288,16 @@ impl JsString { /// Gets an iterator of all the Unicode codepoints of a [`JsString`]. #[inline] - pub fn code_points(&self) -> impl Iterator + Clone + '_ { - self.as_str().code_points() + #[must_use] + pub fn code_points(&self) -> CodePointsIter<'_> { + (self.vtable().code_points)(self.ptr) + } + + /// Get the variant of this string. + #[inline] + #[must_use] + pub fn variant(&self) -> JsStrVariant<'_> { + self.as_str().variant() } /// Abstract operation `StringIndexOf ( string, searchValue, fromIndex )` @@ -343,61 +380,124 @@ impl JsString { /// Trim whitespace from the start and end of the [`JsString`]. 
#[inline] #[must_use] - pub fn trim(&self) -> JsStr<'_> { - self.as_str().trim() + pub fn trim(&self) -> JsString { + // Calculate both bounds directly to avoid intermediate allocations. + let (start, end) = match self.variant() { + JsStrVariant::Latin1(v) => { + let Some(start) = v.iter().position(|c| !is_trimmable_whitespace_latin1(*c)) else { + return StaticJsStrings::EMPTY_STRING; + }; + let end = v + .iter() + .rposition(|c| !is_trimmable_whitespace_latin1(*c)) + .unwrap_or(start); + (start, end) + } + JsStrVariant::Utf16(v) => { + let Some(start) = v.iter().copied().position(|r| { + !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace) + }) else { + return StaticJsStrings::EMPTY_STRING; + }; + let end = v + .iter() + .copied() + .rposition(|r| { + !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace) + }) + .unwrap_or(start); + (start, end) + } + }; + + // SAFETY: `position(...)` and `rposition(...)` cannot exceed the length of the string. + unsafe { Self::slice_unchecked(self, start, end + 1) } } /// Trim whitespace from the start of the [`JsString`]. #[inline] #[must_use] - pub fn trim_start(&self) -> JsStr<'_> { - self.as_str().trim_start() + pub fn trim_start(&self) -> JsString { + let Some(start) = (match self.variant() { + JsStrVariant::Latin1(v) => v.iter().position(|c| !is_trimmable_whitespace_latin1(*c)), + JsStrVariant::Utf16(v) => v + .iter() + .copied() + .position(|r| !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace)), + }) else { + return StaticJsStrings::EMPTY_STRING; + }; + + // SAFETY: `position(...)` cannot exceed the length of the string. + unsafe { Self::slice_unchecked(self, start, self.len()) } } /// Trim whitespace from the end of the [`JsString`]. 
#[inline] #[must_use] - pub fn trim_end(&self) -> JsStr<'_> { - self.as_str().trim_end() + pub fn trim_end(&self) -> JsString { + let Some(end) = (match self.variant() { + JsStrVariant::Latin1(v) => v.iter().rposition(|c| !is_trimmable_whitespace_latin1(*c)), + JsStrVariant::Utf16(v) => v + .iter() + .copied() + .rposition(|r| !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace)), + }) else { + return StaticJsStrings::EMPTY_STRING; + }; + + // SAFETY: `rposition(...)` cannot exceed the length of the string. `end` is the last + // character that is not trimmable, therefore we need to add 1 to it. + unsafe { Self::slice_unchecked(self, 0, end + 1) } } - /// Get the element a the given index, [`None`] otherwise. + /// Returns `true` if `needle` is a prefix of the [`JsString`]. #[inline] #[must_use] - pub fn get<'a, I>(&'a self, index: I) -> Option - where - I: JsSliceIndex<'a>, - { + // We check the size, so this should never panic. + #[allow(clippy::missing_panics_doc)] + pub fn starts_with(&self, needle: JsStr<'_>) -> bool { + self.as_str().starts_with(needle) + } + + /// Returns `true` if `needle` is a suffix of the [`JsString`]. + #[inline] + #[must_use] + // We check the size, so this should never panic. + #[allow(clippy::missing_panics_doc)] + pub fn ends_with(&self, needle: JsStr<'_>) -> bool { + self.as_str().ends_with(needle) + } + + /// Get the `u16` code unit at index. This does not parse any characters if there + /// are pairs, it is simply the index of the `u16` elements. + #[inline] + #[must_use] + pub fn code_unit_at(&self, index: usize) -> Option { self.as_str().get(index) } - /// Returns an element or subslice depending on the type of index, without doing bounds check. - /// - /// # Safety - /// - /// Caller must ensure the index is not out of bounds + /// Get the element at the given index, or [`None`] if the index is out of range.
#[inline] #[must_use] - pub unsafe fn get_unchecked<'a, I>(&'a self, index: I) -> I::Value + pub fn get(&self, index: I) -> Option where - I: JsSliceIndex<'a>, + I: JsStringSliceIndex, { - // SAFETY: Caller must ensure the index is not out of bounds - unsafe { self.as_str().get_unchecked(index) } + index.get(self) } - /// Get the element a the given index. + /// Get the element at the given index, or panic. /// /// # Panics - /// - /// If the index is out of bounds. + /// If the index returns `None`, this will panic. #[inline] #[must_use] - pub fn get_expect<'a, I>(&'a self, index: I) -> I::Value + pub fn get_expect(&self, index: I) -> JsString where - I: JsSliceIndex<'a>, + I: JsStringSliceIndex, { - self.as_str().get_expect(index) + index.get(self).expect("Unexpected get()") } /// Gets a displayable escaped string. This may be faster and has fewer @@ -406,7 +506,7 @@ impl JsString { #[inline] #[must_use] pub fn display_escaped(&self) -> JsStrDisplayEscaped<'_> { - self.as_str().display_escaped() + JsStrDisplayEscaped::from(self) } /// Gets a displayable lossy string. This may be faster and has fewer @@ -726,7 +826,7 @@ impl std::fmt::Debug for JsString { #[inline] fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_tuple("JsString") - .field(&self.display_escaped()) + .field(&self.display_escaped().to_string()) .finish() } } @@ -936,3 +1036,48 @@ impl FromStr for JsString { Ok(Self::from(s)) } } + +/// Similar to [`std::ops::RangeBounds`] but custom implemented for getting direct indices. +// TODO: remove [`str::JsSliceIndex`] and rename this when `JsStr` is no more. +pub trait JsStringSliceIndex { + /// Get the substring (or `None` if outside the string). + fn get(self, str: &JsString) -> Option; +} + +macro_rules! impl_js_string_slice_index { + ($($type:ty),+ $(,)?) 
=> { + $( + impl JsStringSliceIndex for $type { + fn get(self, str: &JsString) -> Option { + let start = match std::ops::RangeBounds::::start_bound(&self) { + std::ops::Bound::Included(start) => *start, + std::ops::Bound::Excluded(start) => *start + 1, + std::ops::Bound::Unbounded => 0, + }; + + let end = match std::ops::RangeBounds::::end_bound(&self) { + std::ops::Bound::Included(end) => *end + 1, + std::ops::Bound::Excluded(end) => *end, + std::ops::Bound::Unbounded => str.len(), + }; + + if end > str.len() || start > end { + None + } else { + // SAFETY: we just checked the indices. + Some(unsafe { JsString::slice_unchecked(str, start, end) }) + } + } + } + )+ + }; +} + +impl_js_string_slice_index!( + std::ops::Range, + std::ops::RangeInclusive, + std::ops::RangeTo, + std::ops::RangeToInclusive, + std::ops::RangeFrom, + std::ops::RangeFull, +); diff --git a/core/string/src/str.rs b/core/string/src/str.rs index a766767aebf..3ed0176ce93 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -1,9 +1,5 @@ use super::iter::{CodePointsIter, Windows}; -use crate::{ - CodePoint, Iter, TaggedLen, - display::{JsStrDisplayEscaped, JsStrDisplayLossy}, - is_trimmable_whitespace, is_trimmable_whitespace_latin1, -}; +use crate::{CodePoint, Iter, TaggedLen, display::JsStrDisplayLossy, is_trimmable_whitespace}; use std::ptr::NonNull; use std::{ hash::{Hash, Hasher}, @@ -11,36 +7,6 @@ use std::{ slice::SliceIndex, }; -// Modified port of -#[inline] -pub(crate) const fn trim_latin1_start(mut bytes: &[u8]) -> &[u8] { - // Note: A pattern matching based approach (instead of indexing) allows - // making the function const. - while let [first, rest @ ..] = bytes { - if is_trimmable_whitespace_latin1(*first) { - bytes = rest; - } else { - break; - } - } - bytes -} - -// Modified port of -#[inline] -pub(crate) const fn trim_latin1_end(mut bytes: &[u8]) -> &[u8] { - // Note: A pattern matching based approach (instead of indexing) allows - // making the function const. 
- while let [rest @ .., last] = bytes { - if is_trimmable_whitespace_latin1(*last) { - bytes = rest; - } else { - break; - } - } - bytes -} - /// Inner representation of a [`JsStr`]. #[derive(Debug, Clone, Copy)] pub enum JsStrVariant<'a> { @@ -183,53 +149,6 @@ impl<'a> JsStr<'a> { self.len() == 0 } - /// Trims both leading and trailing space. - #[inline] - #[must_use] - pub fn trim(self) -> JsStr<'a> { - self.trim_start().trim_end() - } - - /// Trims all leading space. - #[inline] - #[must_use] - pub fn trim_start(self) -> Self { - match self.variant() { - JsStrVariant::Latin1(s) => Self::latin1(trim_latin1_start(s)), - JsStrVariant::Utf16(s) => { - let value = if let Some(left) = s.iter().copied().position(|r| { - !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace) - }) { - &s[left..] - } else { - return Self::EMPTY; - }; - - Self::utf16(value) - } - } - } - - /// Trims all trailing space. - #[inline] - #[must_use] - pub fn trim_end(self) -> Self { - match self.variant() { - JsStrVariant::Latin1(s) => Self::latin1(trim_latin1_end(s)), - JsStrVariant::Utf16(s) => { - let value = if let Some(right) = s.iter().copied().rposition(|r| { - !char::from_u32(u32::from(r)).is_some_and(is_trimmable_whitespace) - }) { - &s[..=right] - } else { - return Self::EMPTY; - }; - - Self::utf16(value) - } - } - } - /// Returns an element or subslice depending on the type of index, otherwise [`None`]. #[inline] #[must_use] @@ -483,41 +402,6 @@ impl<'a> JsStr<'a> { char::decode_utf16(self.iter()).map(|res| res.unwrap_or('\u{FFFD}')) } - /// Decodes a [`JsStr`] into an iterator of [`Result`], returning surrogates as - /// errors. 
- #[inline] - #[allow(clippy::missing_panics_doc)] - pub fn to_std_string_with_surrogates( - &self, - ) -> impl Iterator> + use<'a> { - let mut iter = self.code_points().peekable(); - - std::iter::from_fn(move || { - let cp = iter.next()?; - let char = match cp { - CodePoint::Unicode(c) => c, - CodePoint::UnpairedSurrogate(surr) => return Some(Err(surr)), - }; - - let mut string = String::from(char); - - loop { - let Some(cp) = iter.peek().and_then(|cp| match cp { - CodePoint::Unicode(c) => Some(*c), - CodePoint::UnpairedSurrogate(_) => None, - }) else { - break; - }; - - string.push(cp); - - iter.next().expect("should exist by the check above"); - } - - Some(Ok(string)) - }) - } - /// Decodes a [`JsStr`] into a [`String`], returning an error if it contains any invalid data. /// /// # Errors @@ -531,14 +415,6 @@ impl<'a> JsStr<'a> { } } - /// Decodes a [`JsStr`] into a [`String`], replacing invalid data with its escaped representation - /// in 4 digit hexadecimal. - #[inline] - #[must_use] - pub fn to_std_string_escaped(&self) -> String { - self.display_escaped().to_string() - } - /// Decodes a [`JsStr`] into a [`String`], replacing invalid data with the /// replacement character U+FFFD. #[inline] @@ -547,17 +423,6 @@ impl<'a> JsStr<'a> { self.display_lossy().to_string() } - /// Gets a displayable escaped string. - /// - /// This may be faster and has fewer - /// allocations than `format!("{}", str.to_string_escaped())` when - /// displaying. - #[inline] - #[must_use] - pub fn display_escaped(&self) -> JsStrDisplayEscaped<'a> { - JsStrDisplayEscaped::from(*self) - } - /// Gets a displayable lossy string. 
/// /// This may be faster and has fewer @@ -659,7 +524,7 @@ impl<'a> PartialEq> for [u16] { impl std::fmt::Debug for JsStr<'_> { #[inline] fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.to_std_string_escaped().fmt(f) + f.debug_struct("JsStr").field("len", &self.len()).finish() } } diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index 0227c2cb275..43d0d3268d9 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -145,7 +145,7 @@ fn trim_start_non_ascii_to_ascii() { let s = "\u{2029}abc"; let x = JsString::from(s); - let y = JsString::from(x.trim_start()); + let y = x.trim_start(); assert_eq!(&y, s.trim_start()); } @@ -546,3 +546,10 @@ fn split() { drop(base_str); assert_eq!(str3, JsString::from("Hello")); } + +#[test] +fn trim() { + // Very basic test for trimming. The extensive testing is done by `boa_engine`. + let base_str = JsString::from(" \u{000B} Hello World \t "); + assert_eq!(base_str.trim(), JsString::from("Hello World")); +} diff --git a/core/string/src/vtable/mod.rs b/core/string/src/vtable/mod.rs index f3979320dc8..3e5f9151426 100644 --- a/core/string/src/vtable/mod.rs +++ b/core/string/src/vtable/mod.rs @@ -1,4 +1,5 @@ //! Module defining the [`JsString`] `VTable` and kinds of strings. +use crate::iter::CodePointsIter; use crate::{JsStr, JsString, JsStringKind}; use std::ptr::NonNull; @@ -24,6 +25,10 @@ pub(crate) struct JsStringVTable { /// of the lifetime of the string itself. This is conveyed by the [`JsString`] API /// itself rather than this vtable. pub as_str: fn(NonNull) -> JsStr<'static>, + /// Get an iterator of code points. This is the basic form of character access. + /// Although this is marked as `'static`, this is really of the lifetime of the string + /// itself. This is conveyed by the [`JsString`] API itself rather than this vtable. + pub code_points: fn(NonNull) -> CodePointsIter<'static>, /// Get the refcount, if applicable. 
pub refcount: fn(NonNull) -> Option, /// Get the length of the string. Since a string is immutable, this does not need diff --git a/core/string/src/vtable/sequence.rs b/core/string/src/vtable/sequence.rs index 2f24c97c61b..dea448c2d16 100644 --- a/core/string/src/vtable/sequence.rs +++ b/core/string/src/vtable/sequence.rs @@ -1,4 +1,5 @@ //! `VTable` implementations for [`SequenceString`]. +use crate::iter::CodePointsIter; use crate::r#type::InternalStringType; use crate::vtable::JsStringVTable; use crate::{JsStr, JsString, alloc_overflow}; @@ -36,6 +37,7 @@ impl SequenceString { clone: seq_clone::, drop: seq_drop::, as_str: seq_as_str::, + code_points: seq_code_points::, refcount: seq_refcount::, len, kind: T::KIND, @@ -122,6 +124,7 @@ impl SequenceString { } } +#[inline] fn seq_clone(vtable: NonNull) -> JsString { // SAFETY: This is part of the correct vtable which is validated on construction. let this: &SequenceString = unsafe { vtable.cast().as_ref() }; @@ -133,6 +136,7 @@ fn seq_clone(vtable: NonNull) -> JsString unsafe { JsString::from_ptr(vtable) } } +#[inline] fn seq_drop(vtable: NonNull) { // SAFETY: This is part of the correct vtable which is validated on construction. let this: &SequenceString = unsafe { vtable.cast().as_ref() }; @@ -159,6 +163,7 @@ fn seq_drop(vtable: NonNull) { } } +#[inline] fn seq_as_str(vtable: NonNull) -> JsStr<'static> { // SAFETY: This is part of the correct vtable which is validated on construction. let this: &SequenceString = unsafe { vtable.cast().as_ref() }; @@ -170,7 +175,15 @@ fn seq_as_str(vtable: NonNull) -> JsStr<' T::str_ctor(slice) } +#[inline] +fn seq_code_points( + vtable: NonNull, +) -> CodePointsIter<'static> { + CodePointsIter::new(seq_as_str::(vtable)) +} + /// `VTable` function for refcount, need to return an `Option`. +#[inline] #[allow(clippy::unnecessary_wraps)] fn seq_refcount(vtable: NonNull) -> Option { // SAFETY: This is part of the correct vtable which is validated on construction. 
diff --git a/core/string/src/vtable/slice.rs b/core/string/src/vtable/slice.rs index a468823378e..b2bc76e0008 100644 --- a/core/string/src/vtable/slice.rs +++ b/core/string/src/vtable/slice.rs @@ -1,3 +1,4 @@ +use crate::iter::CodePointsIter; use crate::vtable::JsStringVTable; use crate::{JsStr, JsString, JsStringKind}; use std::cell::Cell; @@ -7,7 +8,7 @@ use std::ptr::NonNull; /// A slice of an existing string. #[repr(C)] pub(crate) struct SliceString { - /// Embedded `VTable` - must be first field for vtable dispatch. + /// Embedded `VTable` - must be the first field for vtable dispatch. vtable: JsStringVTable, // Keep this for refcounting the original string. owned: JsString, @@ -32,6 +33,7 @@ impl SliceString { clone: slice_clone, drop: slice_drop, as_str: slice_as_str, + code_points: slice_code_points, refcount: slice_refcount, len, kind: JsStringKind::Slice, @@ -52,6 +54,7 @@ impl SliceString { } } +#[inline] pub(super) fn slice_clone(vtable: NonNull) -> JsString { // SAFETY: This is part of the correct vtable which is validated on construction. let this: &SliceString = unsafe { vtable.cast().as_ref() }; @@ -63,6 +66,7 @@ pub(super) fn slice_clone(vtable: NonNull) -> JsString { unsafe { JsString::from_ptr(vtable) } } +#[inline] fn slice_drop(vtable: NonNull) { // SAFETY: This is part of the correct vtable which is validated on construction. let this: &SliceString = unsafe { vtable.cast().as_ref() }; @@ -81,6 +85,7 @@ fn slice_drop(vtable: NonNull) { } } +#[inline] fn slice_as_str(vtable: NonNull) -> JsStr<'static> { // SAFETY: This is part of the correct vtable which is validated on construction. let this: &SliceString = unsafe { vtable.cast().as_ref() }; @@ -99,7 +104,13 @@ fn slice_as_str(vtable: NonNull) -> JsStr<'static> { } } +#[inline] +fn slice_code_points(vtable: NonNull) -> CodePointsIter<'static> { + CodePointsIter::new(slice_as_str(vtable)) +} + /// `VTable` function for refcount, need to return an `Option`. 
+#[inline] #[allow(clippy::unnecessary_wraps)] fn slice_refcount(vtable: NonNull) -> Option { // SAFETY: This is part of the correct vtable which is validated on construction. diff --git a/core/string/src/vtable/static.rs b/core/string/src/vtable/static.rs index 34772c55863..de57b051903 100644 --- a/core/string/src/vtable/static.rs +++ b/core/string/src/vtable/static.rs @@ -1,3 +1,4 @@ +use crate::iter::CodePointsIter; use crate::vtable::JsStringVTable; use crate::{JsStr, JsString, JsStringKind}; use std::hash::{Hash, Hasher}; @@ -7,7 +8,7 @@ use std::ptr::NonNull; #[derive(Debug, Clone, Copy)] #[repr(C)] pub struct StaticString { - /// Embedded `VTable` - must be first field for vtable dispatch. + /// Embedded `VTable` - must be the first field for vtable dispatch. vtable: JsStringVTable, /// The actual string data. pub(crate) str: JsStr<'static>, @@ -22,8 +23,9 @@ impl StaticString { clone: static_clone, drop: static_drop, as_str: static_as_str, - len: str.len(), + code_points: static_code_points, refcount: static_refcount, + len: str.len(), kind: JsStringKind::Static, }, str, @@ -51,22 +53,31 @@ impl std::borrow::Borrow> for &'static StaticString { } } +#[inline] pub(crate) fn static_clone(this: NonNull) -> JsString { - // Static strings don't need refcounting, just copy the pointer. + // Static strings don't need ref counting, just copy the pointer. // SAFETY: validated the string outside this function. unsafe { JsString::from_ptr(this) } } +#[inline] fn static_drop(_ptr: NonNull) { // Static strings don't need cleanup. } +#[inline] fn static_as_str(this: NonNull) -> JsStr<'static> { // SAFETY: validated the string outside this function. let this: &StaticString = unsafe { this.cast().as_ref() }; this.str } +#[inline] +fn static_code_points(this: NonNull) -> CodePointsIter<'static> { + CodePointsIter::new(static_as_str(this)) +} + +#[inline] fn static_refcount(_ptr: NonNull) -> Option { // Static strings don't have refcount. None