From 27bb1623cd048f3cbfc527cc315894803deabba2 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 19 Jan 2026 07:32:09 +0100 Subject: [PATCH 01/10] file.c: Optimize `rb_file_dirname_n` fixed costs - `str_null_check` was performed twice, once by `FilePathStringValue` and a second time by `StringValueCStr`. - `StringValueCStr` was checking for the terminator presence, but we don't care about that. - `FilePathStringValue` calls `rb_str_new_frozen` to ensure `fname` isn't mutated, but that's costly for such a check. Instead we can do it in debug mode only. - `rb_enc_get` is slow because it accepts arbitrary objects, even immediates, so it has to do numerous type checks. Add a much faster `rb_str_enc_get` when we know we're dealing with a string. - `rb_enc_copy` is slow for the same reasons, since we already have the encoding, we can use `rb_enc_str_new` instead. --- benchmark/file_dirname.yml | 5 +++++ file.c | 45 ++++++++++++++++++++++++-------------- internal/string.h | 7 ++++++ string.c | 2 +- 4 files changed, 41 insertions(+), 18 deletions(-) create mode 100644 benchmark/file_dirname.yml diff --git a/benchmark/file_dirname.yml b/benchmark/file_dirname.yml new file mode 100644 index 00000000000000..d5c134ad4b6f40 --- /dev/null +++ b/benchmark/file_dirname.yml @@ -0,0 +1,5 @@ +prelude: | + # frozen_string_literal: true +benchmark: + long: File.dirname("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml") + short: File.dirname("foo/bar") diff --git a/file.c b/file.c index 809253fab0d9da..9f4f45e5c67b10 100644 --- a/file.c +++ b/file.c @@ -214,15 +214,16 @@ file_path_convert(VALUE name) return name; } -static rb_encoding * +static void check_path_encoding(VALUE str) { - rb_encoding *enc = rb_enc_get(str); - if (!rb_enc_asciicompat(enc)) { - rb_raise(rb_eEncCompatError, "path name must be ASCII-compatible (%s): %"PRIsVALUE, - rb_enc_name(enc), rb_str_inspect(str)); + if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) { + rb_encoding *enc = 
rb_str_enc_get(str); + if (!rb_enc_asciicompat(enc)) { + rb_raise(rb_eEncCompatError, "path name must be ASCII-compatible (%s): %"PRIsVALUE, + rb_enc_name(enc), rb_str_inspect(str)); + } } - return enc; } VALUE @@ -250,7 +251,7 @@ rb_get_path_check_convert(VALUE obj) rb_raise(rb_eArgError, "path name contains null byte"); } - return rb_str_new4(obj); + return rb_str_new_frozen(obj); } VALUE @@ -265,6 +266,19 @@ rb_get_path(VALUE obj) return rb_get_path_check_convert(rb_get_path_check_to_string(obj)); } +static inline VALUE +check_path(VALUE obj, const char **cstr) +{ + VALUE str = rb_get_path_check_convert(rb_get_path_check_to_string(obj)); +#if RUBY_DEBUG + str = rb_str_new_frozen(str); +#endif + *cstr = RSTRING_PTR(str); + return str; +} + +#define CheckPath(str, cstr) RB_GC_GUARD(str) = check_path(str, &cstr); + VALUE rb_str_encode_ospath(VALUE path) { @@ -4952,7 +4966,8 @@ rb_file_s_basename(int argc, VALUE *argv, VALUE _) if (rb_check_arity(argc, 1, 2) == 2) { fext = argv[1]; StringValue(fext); - enc = check_path_encoding(fext); + check_path_encoding(fext); + enc = rb_str_enc_get(fext); } fname = argv[0]; FilePathStringValue(fname); @@ -5031,10 +5046,9 @@ rb_file_dirname_n(VALUE fname, int n) const char **seps; if (n < 0) rb_raise(rb_eArgError, "negative level: %d", n); - FilePathStringValue(fname); - name = StringValueCStr(fname); + CheckPath(fname, name); end = name + RSTRING_LEN(fname); - enc = rb_enc_get(fname); + enc = rb_str_enc_get(fname); root = skiproot(name, end, enc); #ifdef DOSISH_UNC if (root > name + 1 && isdirsep(*name)) @@ -5077,24 +5091,21 @@ rb_file_dirname_n(VALUE fname, int n) } } if (p == name) { - dirname = rb_str_new(".", 1); - rb_enc_copy(dirname, fname); - return dirname; + return rb_enc_str_new(".", 1, enc); } #ifdef DOSISH_DRIVE_LETTER if (has_drive_letter(name) && isdirsep(*(name + 2))) { const char *top = skiproot(name + 2, end, enc); - dirname = rb_str_new(name, 3); + dirname = rb_enc_str_new(name, 3, enc); rb_str_cat(dirname, 
top, p - top); } else #endif - dirname = rb_str_new(name, p - name); + dirname = rb_enc_str_new(name, p - name, enc); #ifdef DOSISH_DRIVE_LETTER if (has_drive_letter(name) && root == name + 2 && p - name == 2) rb_str_cat(dirname, ".", 1); #endif - rb_enc_copy(dirname, fname); return dirname; } diff --git a/internal/string.h b/internal/string.h index cd1e8d79296ef4..dd5e20c0c68ed1 100644 --- a/internal/string.h +++ b/internal/string.h @@ -50,6 +50,13 @@ rb_str_enc_fastpath(VALUE str) return rb_str_encindex_fastpath(ENCODING_GET_INLINED(str)); } +static inline rb_encoding * +rb_str_enc_get(VALUE str) +{ + RUBY_ASSERT(RB_TYPE_P(str, T_STRING)); + return rb_enc_from_index(ENCODING_GET(str)); +} + /* string.c */ VALUE rb_str_dup_m(VALUE str); VALUE rb_fstring(VALUE); diff --git a/string.c b/string.c index 1e0b9929ef150c..464eab21463ff3 100644 --- a/string.c +++ b/string.c @@ -2880,7 +2880,7 @@ str_null_check(VALUE str, int *w) int minlen = 1; if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) { - rb_encoding *enc = rb_enc_get(str); + rb_encoding *enc = rb_str_enc_get(str); minlen = rb_enc_mbminlen(enc); if (minlen > 1) { From 11d29d32d270d2a98642858fdca25a5272563995 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 19 Jan 2026 08:43:57 +0100 Subject: [PATCH 02/10] file.c: strrdirsep search from the back of the string for common encodings `strrdirsep` quite inefficiently searches for the last separator from the front of the string. This is surprising but necessary because in Shift_JIS, `0x5c` can be the second byte of some multi-byte characters; as such, it's not possible to do a pure ASCII search. And it's even more costly because for each character we need to do expensive checks to handle this possibility. However in the overwhelming majority of cases, paths are encoded in UTF-8 or ASCII, so for these common encodings we can use the more logical and efficient algorithm. 
``` compare-ruby: ruby 4.1.0dev (2026-01-17T14:40:03Z master 00a3b71eaf) +PRISM [arm64-darwin25] built-ruby: ruby 4.1.0dev (2026-01-19T07:43:57Z file-dirname-lower.. a8d3535e5b) +PRISM [arm64-darwin25] ``` | |compare-ruby|built-ruby| |:------|-----------:|---------:| |long | 3.974M| 23.674M| | | -| 5.96x| |short | 15.281M| 29.034M| | | -| 1.90x| --- file.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/file.c b/file.c index 9f4f45e5c67b10..7d47fc60a71438 100644 --- a/file.c +++ b/file.c @@ -3693,7 +3693,6 @@ skipprefixroot(const char *path, const char *end, rb_encoding *enc) #endif } -#define strrdirsep rb_enc_path_last_separator char * rb_enc_path_last_separator(const char *path, const char *end, rb_encoding *enc) { @@ -3712,6 +3711,30 @@ rb_enc_path_last_separator(const char *path, const char *end, rb_encoding *enc) return last; } +static inline char * +strrdirsep(const char *path, const char *end, rb_encoding *enc) +{ + if (RB_LIKELY(enc == NULL)) { + const char *cursor = end - 1; + + while (isdirsep(cursor[0])) { + cursor--; + } + + while (cursor >= path) { + if (isdirsep(cursor[0])) { + while (cursor > path && isdirsep(cursor[-1])) { + cursor--; + } + return (char *)cursor; + } + cursor--; + } + return NULL; + } + return rb_enc_path_last_separator(path, end, enc); +} + static char * chompdirsep(const char *path, const char *end, rb_encoding *enc) { @@ -5036,6 +5059,15 @@ rb_file_dirname(VALUE fname) return rb_file_dirname_n(fname, 1); } +static inline rb_encoding * +path_enc_get(VALUE str) +{ + if (RB_LIKELY(rb_str_enc_fastpath(str))) { + return NULL; + } + return rb_str_enc_get(str); +} + static VALUE rb_file_dirname_n(VALUE fname, int n) { @@ -5048,7 +5080,7 @@ rb_file_dirname_n(VALUE fname, int n) if (n < 0) rb_raise(rb_eArgError, "negative level: %d", n); CheckPath(fname, name); end = name + RSTRING_LEN(fname); - enc = rb_str_enc_get(fname); + enc = path_enc_get(fname); root = 
skiproot(name, end, enc); #ifdef DOSISH_UNC if (root > name + 1 && isdirsep(*name)) @@ -5082,7 +5114,12 @@ rb_file_dirname_n(VALUE fname, int n) if (i == n) i = 0; } else { - Inc(p, end, enc); + if (RB_UNLIKELY(enc)) { + Inc(p, end, enc); + } + else { + p++; + } } } p = seps[i]; @@ -5090,18 +5127,19 @@ rb_file_dirname_n(VALUE fname, int n) break; } } + if (p == name) { - return rb_enc_str_new(".", 1, enc); + return rb_enc_str_new(".", 1, rb_str_enc_get(fname)); } #ifdef DOSISH_DRIVE_LETTER if (has_drive_letter(name) && isdirsep(*(name + 2))) { const char *top = skiproot(name + 2, end, enc); - dirname = rb_enc_str_new(name, 3, enc); + dirname = rb_enc_str_new(name, 3, rb_str_enc_get(fname)); rb_str_cat(dirname, top, p - top); } else #endif - dirname = rb_enc_str_new(name, p - name, enc); + dirname = rb_enc_str_new(name, p - name, rb_str_enc_get(fname)); #ifdef DOSISH_DRIVE_LETTER if (has_drive_letter(name) && root == name + 2 && p - name == 2) rb_str_cat(dirname, ".", 1); From 240a58ecc63d07b50ddfb027a9c9b9bc317ffc49 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 19 Jan 2026 08:52:57 +0100 Subject: [PATCH 03/10] file.c: skiproot doesn't need encoding --- file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/file.c b/file.c index 7d47fc60a71438..95bb557aeef7df 100644 --- a/file.c +++ b/file.c @@ -3636,7 +3636,7 @@ not_same_drive(VALUE path, int drive) #endif /* DOSISH_DRIVE_LETTER */ static inline char * -skiproot(const char *path, const char *end, rb_encoding *enc) +skiproot(const char *path, const char *end) { #ifdef DOSISH_DRIVE_LETTER if (path + 2 <= end && has_drive_letter(path)) path += 2; @@ -3689,7 +3689,7 @@ skipprefixroot(const char *path, const char *end, rb_encoding *enc) while (isdirsep(*p)) p++; return p; #else - return skiproot(path, end, enc); + return skiproot(path, end); #endif } @@ -4075,7 +4075,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na rb_enc_associate(result, 
enc = fs_enc_check(result, fname)); p = pend; } - p = chompdirsep(skiproot(buf, p, enc), p, enc); + p = chompdirsep(skiproot(buf, p), p, enc); s += 2; } } @@ -4100,7 +4100,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na } else #endif /* defined DOSISH || defined __CYGWIN__ */ - p = chompdirsep(skiproot(buf, p, enc), p, enc); + p = chompdirsep(skiproot(buf, p), p, enc); } else { size_t len; @@ -4231,7 +4231,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na BUFCOPY(b, s-b); rb_str_set_len(result, p-buf); } - if (p == skiproot(buf, p + !!*p, enc) - 1) p++; + if (p == skiproot(buf, p + !!*p) - 1) p++; #if USE_NTFS *p = '\0'; @@ -5081,7 +5081,7 @@ rb_file_dirname_n(VALUE fname, int n) CheckPath(fname, name); end = name + RSTRING_LEN(fname); enc = path_enc_get(fname); - root = skiproot(name, end, enc); + root = skiproot(name, end); #ifdef DOSISH_UNC if (root > name + 1 && isdirsep(*name)) root = skipprefix(name = root - 2, end, enc); @@ -5133,7 +5133,7 @@ rb_file_dirname_n(VALUE fname, int n) } #ifdef DOSISH_DRIVE_LETTER if (has_drive_letter(name) && isdirsep(*(name + 2))) { - const char *top = skiproot(name + 2, end, enc); + const char *top = skiproot(name + 2, end); dirname = rb_enc_str_new(name, 3, rb_str_enc_get(fname)); rb_str_cat(dirname, top, p - top); } From 990cf1286a165588c5b2f4d4fa98bb9d45c8fd11 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 19 Jan 2026 08:56:41 +0100 Subject: [PATCH 04/10] file.c: Inc macro now handle NULL encoding --- file.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/file.c b/file.c index 95bb557aeef7df..365df5a292e765 100644 --- a/file.c +++ b/file.c @@ -3571,7 +3571,7 @@ static const char file_alt_separator[] = {FILE_ALT_SEPARATOR, '\0'}; # define isADS(x) 0 #endif -#define Next(p, e, enc) ((p) + rb_enc_mbclen((p), (e), (enc))) +#define Next(p, e, enc) ((p) + ((enc) ? 
rb_enc_mbclen((p), (e), (enc)) : 1)) #define Inc(p, e, enc) ((p) = Next((p), (e), (enc))) #if defined(DOSISH_UNC) @@ -5114,12 +5114,7 @@ rb_file_dirname_n(VALUE fname, int n) if (i == n) i = 0; } else { - if (RB_UNLIKELY(enc)) { - Inc(p, end, enc); - } - else { - p++; - } + Inc(p, end, enc); } } p = seps[i]; From 2d10f1511806d3ef0c327ef0f89ff0ab8c1e5947 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 19 Jan 2026 09:03:42 +0100 Subject: [PATCH 05/10] file.c: dirname_n also use strrdirsep when n > 1 It's both simpler and faster. | |compare-ruby|built-ruby| |:------|-----------:|---------:| |long | 3.960M| 24.072M| | | -| 6.08x| |short | 15.417M| 29.841M| | | -| 1.94x| |n_4 | 3.858M| 18.415M| | | -| 4.77x| --- benchmark/file_dirname.yml | 1 + file.c | 34 ++++++---------------------------- 2 files changed, 7 insertions(+), 28 deletions(-) diff --git a/benchmark/file_dirname.yml b/benchmark/file_dirname.yml index d5c134ad4b6f40..43a81c937182b8 100644 --- a/benchmark/file_dirname.yml +++ b/benchmark/file_dirname.yml @@ -3,3 +3,4 @@ prelude: | benchmark: long: File.dirname("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml") short: File.dirname("foo/bar") + n_4: File.dirname("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml", 4) diff --git a/file.c b/file.c index 365df5a292e765..5d442b5bc5c1cd 100644 --- a/file.c +++ b/file.c @@ -5074,8 +5074,6 @@ rb_file_dirname_n(VALUE fname, int n) const char *name, *root, *p, *end; VALUE dirname; rb_encoding *enc; - VALUE sepsv = 0; - const char **seps; if (n < 0) rb_raise(rb_eArgError, "negative level: %d", n); CheckPath(fname, name); @@ -5093,33 +5091,13 @@ rb_file_dirname_n(VALUE fname, int n) p = root; } else { - int i; - switch (n) { - case 0: - p = end; - break; - case 1: - if (!(p = strrdirsep(root, end, enc))) p = root; - break; - default: - seps = ALLOCV_N(const char *, sepsv, n); - for (i = 0; i < n; ++i) seps[i] = root; - i = 0; - for (p = root; p < end; ) { - if (isdirsep(*p)) { - 
const char *tmp = p++; - while (p < end && isdirsep(*p)) p++; - if (p >= end) break; - seps[i++] = tmp; - if (i == n) i = 0; - } - else { - Inc(p, end, enc); - } + p = end; + while (n) { + if (!(p = strrdirsep(root, p, enc))) { + p = root; + break; } - p = seps[i]; - ALLOCV_END(sepsv); - break; + n--; } } From 6fb50434e325dbaf0ae75ade5b47b26f9671a5d2 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 19 Jan 2026 17:04:15 +0100 Subject: [PATCH 06/10] file.c: Stop passing NULL for encoding `rb_encoding *` is defined as `nonnull` so `if (enc)` is optimized out by the compiler. We have to pass a boolean alongside it to avoid crashes. --- file.c | 124 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 65 insertions(+), 59 deletions(-) diff --git a/file.c b/file.c index 5d442b5bc5c1cd..2829c27f6e7ddc 100644 --- a/file.c +++ b/file.c @@ -3571,8 +3571,8 @@ static const char file_alt_separator[] = {FILE_ALT_SEPARATOR, '\0'}; # define isADS(x) 0 #endif -#define Next(p, e, enc) ((p) + ((enc) ? rb_enc_mbclen((p), (e), (enc)) : 1)) -#define Inc(p, e, enc) ((p) = Next((p), (e), (enc))) +#define Next(p, e, mb_enc, enc) ((p) + ((mb_enc) ? 
rb_enc_mbclen((p), (e), (enc)) : 1)) +#define Inc(p, e, mb_enc, enc) ((p) = Next((p), (e), (mb_enc), (enc))) #if defined(DOSISH_UNC) #define has_unc(buf) (isdirsep((buf)[0]) && isdirsep((buf)[1])) @@ -3645,31 +3645,37 @@ skiproot(const char *path, const char *end) return (char *)path; } -#define nextdirsep rb_enc_path_next -char * -rb_enc_path_next(const char *s, const char *e, rb_encoding *enc) +static inline char * +enc_path_next(const char *s, const char *e, bool mb_enc, rb_encoding *enc) { while (s < e && !isdirsep(*s)) { - Inc(s, e, enc); + Inc(s, e, mb_enc, enc); } return (char *)s; } +#define nextdirsep rb_enc_path_next +char * +rb_enc_path_next(const char *s, const char *e, rb_encoding *enc) +{ + return enc_path_next(s, e, true, enc); +} + #if defined(DOSISH_UNC) || defined(DOSISH_DRIVE_LETTER) -#define skipprefix rb_enc_path_skip_prefix +#define skipprefix enc_path_skip_prefix #else -#define skipprefix(path, end, enc) (path) +#define skipprefix(path, end, mb_enc, enc) (path) #endif -char * -rb_enc_path_skip_prefix(const char *path, const char *end, rb_encoding *enc) +static inline char * +enc_path_skip_prefix(const char *path, const char *end, bool mb_enc, rb_encoding *enc) { #if defined(DOSISH_UNC) || defined(DOSISH_DRIVE_LETTER) #ifdef DOSISH_UNC if (path + 2 <= end && isdirsep(path[0]) && isdirsep(path[1])) { path += 2; while (path < end && isdirsep(*path)) path++; - if ((path = rb_enc_path_next(path, end, enc)) < end && path[0] && path[1] && !isdirsep(path[1])) - path = rb_enc_path_next(path + 1, end, enc); + if ((path = enc_path_next(path, end, mb_enc, enc)) < end && path[0] && path[1] && !isdirsep(path[1])) + path = enc_path_next(path + 1, end, mb_enc, enc); return (char *)path; } #endif @@ -3681,11 +3687,17 @@ rb_enc_path_skip_prefix(const char *path, const char *end, rb_encoding *enc) return (char *)path; } +char * +rb_enc_path_skip_prefix(const char *path, const char *end, rb_encoding *enc) +{ + return enc_path_skip_prefix(path, end, true, enc); 
+} + static inline char * skipprefixroot(const char *path, const char *end, rb_encoding *enc) { #if defined(DOSISH_UNC) || defined(DOSISH_DRIVE_LETTER) - char *p = skipprefix(path, end, enc); + char *p = skipprefix(path, end, true, enc); while (isdirsep(*p)) p++; return p; #else @@ -3705,34 +3717,35 @@ rb_enc_path_last_separator(const char *path, const char *end, rb_encoding *enc) last = (char *)tmp; } else { - Inc(path, end, enc); + Inc(path, end, true, enc); } } return last; } static inline char * -strrdirsep(const char *path, const char *end, rb_encoding *enc) +strrdirsep(const char *path, const char *end, bool mb_enc, rb_encoding *enc) { - if (RB_LIKELY(enc == NULL)) { - const char *cursor = end - 1; + if (RB_UNLIKELY(mb_enc)) { + return rb_enc_path_last_separator(path, end, enc); + } - while (isdirsep(cursor[0])) { - cursor--; - } + const char *cursor = end - 1; - while (cursor >= path) { - if (isdirsep(cursor[0])) { - while (cursor > path && isdirsep(cursor[-1])) { - cursor--; - } - return (char *)cursor; + while (isdirsep(cursor[0])) { + cursor--; + } + + while (cursor >= path) { + if (isdirsep(cursor[0])) { + while (cursor > path && isdirsep(cursor[-1])) { + cursor--; } - cursor--; + return (char *)cursor; } - return NULL; + cursor--; } - return rb_enc_path_last_separator(path, end, enc); + return NULL; } static char * @@ -3745,7 +3758,7 @@ chompdirsep(const char *path, const char *end, rb_encoding *enc) if (path >= end) return (char *)last; } else { - Inc(path, end, enc); + Inc(path, end, true, enc); } } return (char *)path; @@ -3790,7 +3803,7 @@ ntfs_tail(const char *path, const char *end, rb_encoding *enc) if (isADS(*path)) path++; } else { - Inc(path, end, enc); + Inc(path, end, true, enc); } } return (char *)path; @@ -3852,7 +3865,7 @@ copy_home_path(VALUE result, const char *dir) rb_enc_associate_index(result, encidx); #if defined DOSISH || defined __CYGWIN__ enc = rb_enc_from_index(encidx); - for (bend = (p = buf) + dirlen; p < bend; Inc(p, bend, 
enc)) { + for (bend = (p = buf) + dirlen; p < bend; Inc(p, bend, true, enc)) { if (*p == '\\') { *p = '/'; } @@ -4096,7 +4109,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na if (isdirsep(*s)) { /* specified full path, but not drive letter nor UNC */ /* we need to get the drive letter or UNC share name */ - p = skipprefix(buf, p, enc); + p = skipprefix(buf, p, true, enc); } else #endif /* defined DOSISH || defined __CYGWIN__ */ @@ -4124,7 +4137,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na rb_str_set_len(result, p-buf+1); BUFCHECK(bdiff + 1 >= buflen); p[1] = 0; - root = skipprefix(buf, p+1, enc); + root = skipprefix(buf, p+1, true, enc); b = s; while (*s) { @@ -4140,7 +4153,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na /* We must go back to the parent */ char *n; *p = '\0'; - if (!(n = strrdirsep(root, p, enc))) { + if (!(n = strrdirsep(root, p, true, enc))) { *p = '/'; } else { @@ -4203,7 +4216,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na } } #endif /* __APPLE__ */ - Inc(s, fend, enc); + Inc(s, fend, true, enc); break; } } @@ -4503,7 +4516,7 @@ realpath_rec(long *prefixlenp, VALUE *resolvedp, const char *unresolved, VALUE f if (*prefixlenp < RSTRING_LEN(*resolvedp)) { const char *resolved_str = RSTRING_PTR(*resolvedp); const char *resolved_names = resolved_str + *prefixlenp; - const char *lastsep = strrdirsep(resolved_names, resolved_str + RSTRING_LEN(*resolvedp), enc); + const char *lastsep = strrdirsep(resolved_names, resolved_str + RSTRING_LEN(*resolvedp), true, enc); long len = lastsep ? 
lastsep - resolved_names : 0; rb_str_resize(*resolvedp, *prefixlenp + len); } @@ -4653,7 +4666,7 @@ rb_check_realpath_emulate(VALUE basedir, VALUE path, rb_encoding *origenc, enum if (*prefixptr == FILE_ALT_SEPARATOR) { *prefixptr = '/'; } - Inc(prefixptr, pend, enc); + Inc(prefixptr, pend, true, enc); } #endif @@ -4907,7 +4920,7 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin long f = 0, n = -1; end = name + (alllen ? (size_t)*alllen : strlen(name)); - name = skipprefix(name, end, enc); + name = skipprefix(name, end, true, enc); #if defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC root = name; #endif @@ -4934,7 +4947,7 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin #endif /* defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC */ } else { - if (!(p = strrdirsep(name, end, enc))) { + if (!(p = strrdirsep(name, end, true, enc))) { p = name; } else { @@ -4946,7 +4959,7 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin n = chompdirsep(p, end, enc) - p; #endif for (q = p; q - p < n && *q == '.'; q++); - for (e = 0; q - p < n; Inc(q, end, enc)) { + for (e = 0; q - p < n; Inc(q, end, true, enc)) { if (*q == '.') e = q; } if (e) f = e - p; @@ -5059,30 +5072,23 @@ rb_file_dirname(VALUE fname) return rb_file_dirname_n(fname, 1); } -static inline rb_encoding * -path_enc_get(VALUE str) -{ - if (RB_LIKELY(rb_str_enc_fastpath(str))) { - return NULL; - } - return rb_str_enc_get(str); -} - static VALUE rb_file_dirname_n(VALUE fname, int n) { const char *name, *root, *p, *end; VALUE dirname; - rb_encoding *enc; if (n < 0) rb_raise(rb_eArgError, "negative level: %d", n); CheckPath(fname, name); end = name + RSTRING_LEN(fname); - enc = path_enc_get(fname); + + bool mb_enc = !rb_str_enc_fastpath(fname); + rb_encoding *enc = rb_str_enc_get(fname); + root = skiproot(name, end); #ifdef DOSISH_UNC if (root > name + 1 && isdirsep(*name)) - root = skipprefix(name = root - 2, end, enc); 
+ root = skipprefix(name = root - 2, end, mb_enc, enc); #else if (root > name + 1) name = root - 1; @@ -5093,7 +5099,7 @@ rb_file_dirname_n(VALUE fname, int n) else { p = end; while (n) { - if (!(p = strrdirsep(root, p, enc))) { + if (!(p = strrdirsep(root, p, mb_enc, enc))) { p = root; break; } @@ -5102,17 +5108,17 @@ rb_file_dirname_n(VALUE fname, int n) } if (p == name) { - return rb_enc_str_new(".", 1, rb_str_enc_get(fname)); + return rb_enc_str_new(".", 1, enc); } #ifdef DOSISH_DRIVE_LETTER if (has_drive_letter(name) && isdirsep(*(name + 2))) { const char *top = skiproot(name + 2, end); - dirname = rb_enc_str_new(name, 3, rb_str_enc_get(fname)); + dirname = rb_enc_str_new(name, 3, enc); rb_str_cat(dirname, top, p - top); } else #endif - dirname = rb_enc_str_new(name, p - name, rb_str_enc_get(fname)); + dirname = rb_enc_str_new(name, p - name, enc); #ifdef DOSISH_DRIVE_LETTER if (has_drive_letter(name) && root == name + 2 && p - name == 2) rb_str_cat(dirname, ".", 1); @@ -5137,7 +5143,7 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc) { const char *p, *e, *end = name + (len ? *len : (long)strlen(name)); - p = strrdirsep(name, end, enc); /* get the last path component */ + p = strrdirsep(name, end, true, enc); /* get the last path component */ if (!p) p = name; else @@ -5170,7 +5176,7 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc) #endif else if (isdirsep(*p)) break; - Inc(p, end, enc); + Inc(p, end, true, enc); } if (len) { From 53fe9933fd6c62f3a7f1ed2908a99510c2f27adc Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Tue, 20 Jan 2026 08:56:17 +0100 Subject: [PATCH 07/10] Optimize `File.extname` for common encodings Similar optimizations to the ones performed in GH-15907. - Skip the expensive multi-byte encoding handling for the common encodings that are known to be safe. - Use `CheckPath` to save on copying the argument and only scan it for NULL bytes once. 
- Create the return string with rb_enc_str_new instead of rb_str_subseq as it's going to be a very small string anyway. This could be optimized a little bit further by searching for both `.` and `dirsep` in one pass. ``` compare-ruby: ruby 4.1.0dev (2026-01-19T03:51:30Z master 631bf19b37) +PRISM [arm64-darwin25] built-ruby: ruby 4.1.0dev (2026-01-20T07:33:42Z master 6fb50434e3) +PRISM [arm64-darwin25] ``` | |compare-ruby|built-ruby| |:----------|-----------:|---------:| |long | 3.606M| 22.229M| | | -| 6.17x| |long_name | 2.254M| 13.416M| | | -| 5.95x| |short | 16.488M| 29.969M| | | -| 1.82x| --- benchmark/file_extname.yml | 6 ++++ file.c | 61 +++++++++++++++++++++----------------- 2 files changed, 40 insertions(+), 27 deletions(-) create mode 100644 benchmark/file_extname.yml diff --git a/benchmark/file_extname.yml b/benchmark/file_extname.yml new file mode 100644 index 00000000000000..fb16e558405530 --- /dev/null +++ b/benchmark/file_extname.yml @@ -0,0 +1,6 @@ +prelude: | + # frozen_string_literal: true +benchmark: + long: File.extname("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml") + long_name: File.extname("Users_george_src_github.com_ruby_ruby_benchmark_file_dirname.yml") + short: File.extname("foo/bar") diff --git a/file.c b/file.c index 2829c27f6e7ddc..a98bb9728e00db 100644 --- a/file.c +++ b/file.c @@ -5126,24 +5126,12 @@ rb_file_dirname_n(VALUE fname, int n) return dirname; } -/* - * accept a String, and return the pointer of the extension. - * if len is passed, set the length of extension to it. - * returned pointer is in ``name'' or NULL. - * returns *len - * no dot NULL 0 - * dotfile top 0 - * end with dot dot 1 - * .ext dot len of .ext - * .ext:stream dot len of .ext without :stream (NTFS only) - * - */ -const char * -ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc) +static inline const char * +enc_find_extname(const char *name, long *len, bool mb_enc, rb_encoding *enc) { const char *p, *e, *end = name + (len ? 
*len : (long)strlen(name)); - p = strrdirsep(name, end, true, enc); /* get the last path component */ + p = strrdirsep(name, end, mb_enc, enc); /* get the last path component */ if (!p) p = name; else @@ -5176,7 +5164,7 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc) #endif else if (isdirsep(*p)) break; - Inc(p, end, true, enc); + Inc(p, end, mb_enc, enc); } if (len) { @@ -5191,6 +5179,24 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc) return e; } +/* + * accept a String, and return the pointer of the extension. + * if len is passed, set the length of extension to it. + * returned pointer is in ``name'' or NULL. + * returns *len + * no dot NULL 0 + * dotfile top 0 + * end with dot dot 1 + * .ext dot len of .ext + * .ext:stream dot len of .ext without :stream (NTFS only) + * + */ +const char * +ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc) +{ + return enc_find_extname(name, len, true, enc); +} + /* * call-seq: * File.extname(path) -> string @@ -5220,18 +5226,19 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc) static VALUE rb_file_s_extname(VALUE klass, VALUE fname) { - const char *name, *e; - long len; - VALUE extname; + const char *name; + CheckPath(fname, name); + long len = RSTRING_LEN(fname); - FilePathStringValue(fname); - name = StringValueCStr(fname); - len = RSTRING_LEN(fname); - e = ruby_enc_find_extname(name, &len, rb_enc_get(fname)); - if (len < 1) - return rb_str_new(0, 0); - extname = rb_str_subseq(fname, e - name, len); /* keep the dot, too! 
*/ - return extname; + if (len < 1) { + return rb_enc_str_new(0, 0, rb_str_enc_get(fname)); + } + + bool mb_enc = !rb_str_enc_fastpath(fname); + rb_encoding *enc = rb_str_enc_get(fname); + + const char *ext = enc_find_extname(name, &len, mb_enc, enc); + return rb_enc_str_new(ext, len, enc); } /* From 35a7b5159f39de2cac848c072674e5350cc41aa4 Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Tue, 20 Jan 2026 08:53:39 +0100 Subject: [PATCH 08/10] [ruby/prism] Add Ripper :on_sp events for Prism.lex_compat and Prism::Translation::Ripper * Handle line continuations. * Handle space at the end of file in LexCompat. https://github.com/ruby/prism/commit/32bd13eb7d Co-authored-by: Earlopain <14981592+Earlopain@users.noreply.github.com> --- lib/prism.rb | 8 +- lib/prism/lex_compat.rb | 101 ++++++++++++++++++++-- lib/prism/lex_ripper.rb | 2 - test/prism/fixtures/bom_leading_space.txt | 1 + test/prism/fixtures/bom_spaces.txt | 1 + test/prism/ruby/ripper_test.rb | 12 ++- 6 files changed, 106 insertions(+), 19 deletions(-) create mode 100644 test/prism/fixtures/bom_leading_space.txt create mode 100644 test/prism/fixtures/bom_spaces.txt diff --git a/lib/prism.rb b/lib/prism.rb index d809557fce101f..dab3420377214f 100644 --- a/lib/prism.rb +++ b/lib/prism.rb @@ -61,8 +61,7 @@ def initialize(version) # Prism::lex_compat(source, **options) -> LexCompat::Result # # Returns a parse result whose value is an array of tokens that closely - # resembles the return value of Ripper::lex. The main difference is that the - # `:on_sp` token is not emitted. + # resembles the return value of Ripper::lex. # # For supported options, see Prism::parse. def self.lex_compat(source, **options) @@ -72,9 +71,8 @@ def self.lex_compat(source, **options) # :call-seq: # Prism::lex_ripper(source) -> Array # - # This lexes with the Ripper lex. It drops any space events but otherwise - # returns the same tokens. Raises SyntaxError if the syntax in source is - # invalid. + # This wraps the result of Ripper.lex. 
It produces almost exactly the + # same tokens. Raises SyntaxError if the syntax in source is invalid. def self.lex_ripper(source) LexRipper.new(source).result # steep:ignore end diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb index f7b9a0effc969d..597e63c73e73b7 100644 --- a/lib/prism/lex_compat.rb +++ b/lib/prism/lex_compat.rb @@ -226,7 +226,7 @@ def state end # Tokens where state should be ignored - # used for :on_comment, :on_heredoc_end, :on_embexpr_end + # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end class IgnoreStateToken < Token def ==(other) # :nodoc: self[0...-1] == other[0...-1] @@ -611,10 +611,10 @@ def self.build(opening) BOM_FLUSHED = RUBY_VERSION >= "3.3.0" private_constant :BOM_FLUSHED - attr_reader :source, :options + attr_reader :options - def initialize(source, **options) - @source = source + def initialize(code, **options) + @code = code @options = options end @@ -624,12 +624,14 @@ def result state = :default heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]] - result = Prism.lex(source, **options) + result = Prism.lex(@code, **options) + source = result.source result_value = result.value previous_state = nil #: State? last_heredoc_end = nil #: Integer? 
+ eof_token = nil - bom = source.byteslice(0..2) == "\xEF\xBB\xBF" + bom = source.slice(0, 3) == "\xEF\xBB\xBF" result_value.each_with_index do |(token, lex_state), index| lineno = token.location.start_line @@ -741,6 +743,7 @@ def result Token.new([[lineno, column], event, value, lex_state]) when :on_eof + eof_token = token previous_token = result_value[index - 1][0] # If we're at the end of the file and the previous token was a @@ -763,7 +766,7 @@ def result end_offset += 3 end - tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state]) + tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state]) end end @@ -857,7 +860,89 @@ def result # We sort by location to compare against Ripper's output tokens.sort_by!(&:location) - Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source)) + # Add :on_sp tokens + tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token) + + Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source) + end + + def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token) + new_tokens = [] + + prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG) + prev_token_end = bom ? 
3 : 0 + + tokens.each do |token| + line, column = token.location + start_offset = source.line_to_byte_offset(line) + column + # Ripper reports columns on line 1 without counting the BOM, so we adjust to get the real offset + start_offset += 3 if line == 1 && bom + + if start_offset > prev_token_end + sp_value = source.slice(prev_token_end, start_offset - prev_token_end) + sp_line = source.line(prev_token_end) + sp_column = source.column(prev_token_end) + # Ripper reports columns on line 1 without counting the BOM + sp_column -= 3 if sp_line == 1 && bom + continuation_index = sp_value.byteindex("\\") + + # ripper emits up to three :on_sp tokens when line continuations are used + if continuation_index + next_whitespace_index = continuation_index + 1 + next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r" + next_whitespace_index += 1 + first_whitespace = sp_value[0...continuation_index] + continuation = sp_value[continuation_index...next_whitespace_index] + second_whitespace = sp_value[next_whitespace_index..] + + new_tokens << IgnoreStateToken.new([ + [sp_line, sp_column], + :on_sp, + first_whitespace, + prev_token_state + ]) unless first_whitespace.empty? + + new_tokens << IgnoreStateToken.new([ + [sp_line, sp_column + continuation_index], + :on_sp, + continuation, + prev_token_state + ]) + + new_tokens << IgnoreStateToken.new([ + [sp_line + 1, 0], + :on_sp, + second_whitespace, + prev_token_state + ]) unless second_whitespace.empty? 
+ else + new_tokens << IgnoreStateToken.new([ + [sp_line, sp_column], + :on_sp, + sp_value, + prev_token_state + ]) + end + end + + new_tokens << token + prev_token_state = token.state + prev_token_end = start_offset + token.value.bytesize + end + + unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl + end_offset = eof_token.location.end_offset + if prev_token_end < end_offset + new_tokens << IgnoreStateToken.new([ + [source.line(prev_token_end), source.column(prev_token_end)], + :on_sp, + source.slice(prev_token_end, end_offset - prev_token_end), + prev_token_state + ]) + end + end + + new_tokens end end diff --git a/lib/prism/lex_ripper.rb b/lib/prism/lex_ripper.rb index 4b5c3b77fd6112..2054cf55ac0c70 100644 --- a/lib/prism/lex_ripper.rb +++ b/lib/prism/lex_ripper.rb @@ -19,8 +19,6 @@ def result lex(source).each do |token| case token[1] - when :on_sp - # skip when :on_tstring_content if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@")) previous[2] << token[2] diff --git a/test/prism/fixtures/bom_leading_space.txt b/test/prism/fixtures/bom_leading_space.txt new file mode 100644 index 00000000000000..48d3ee50ea47b0 --- /dev/null +++ b/test/prism/fixtures/bom_leading_space.txt @@ -0,0 +1 @@ + p (42) diff --git a/test/prism/fixtures/bom_spaces.txt b/test/prism/fixtures/bom_spaces.txt new file mode 100644 index 00000000000000..c18ad4c21ad7e7 --- /dev/null +++ b/test/prism/fixtures/bom_spaces.txt @@ -0,0 +1 @@ +p ( 42 ) diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb index 2a0504c19f35f0..280abd94ea3e64 100644 --- a/test/prism/ruby/ripper_test.rb +++ b/test/prism/ruby/ripper_test.rb @@ -39,6 +39,8 @@ class RipperTest < TestCase # Skip these tests that we haven't implemented yet. 
omitted_sexp_raw = [ + "bom_leading_space.txt", + "bom_spaces.txt", "dos_endings.txt", "heredocs_with_fake_newlines.txt", "heredocs_with_ignored_newlines.txt", @@ -92,7 +94,7 @@ def test_lexer assert_equal(expected, lexer.parse[0].to_a) assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a) - assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event)) + assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event)) assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) } end @@ -121,15 +123,17 @@ def assert_ripper_sexp_raw(source) def assert_ripper_lex(source) prism = Translation::Ripper.lex(source) ripper = Ripper.lex(source) - ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp - ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order + + # Prism emits tokens by their order in the code, not in parse order + ripper.sort_by! { |elem| elem[0] } [prism.size, ripper.size].max.times do |i| expected = ripper[i] actual = prism[i] + # Since tokens related to heredocs are not emitted in the same order, # the state also doesn't line up. - if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end + if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end expected[3] = actual[3] = nil end From 58f1127b51cf4fbb1f334f8701a041f40701dca2 Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Tue, 20 Jan 2026 19:10:16 +0900 Subject: [PATCH 09/10] Revert "[ruby/prism] Add Ripper :on_sp events for Prism.lex_compat and Prism::Translation::Ripper" This reverts commit 35a7b5159f39de2cac848c072674e5350cc41aa4. This broke syntax_suggest. 
https://github.com/ruby/ruby/actions/runs/21167011751/job/60874111912 --- lib/prism.rb | 8 +- lib/prism/lex_compat.rb | 101 ++-------------------- lib/prism/lex_ripper.rb | 2 + test/prism/fixtures/bom_leading_space.txt | 1 - test/prism/fixtures/bom_spaces.txt | 1 - test/prism/ruby/ripper_test.rb | 12 +-- 6 files changed, 19 insertions(+), 106 deletions(-) delete mode 100644 test/prism/fixtures/bom_leading_space.txt delete mode 100644 test/prism/fixtures/bom_spaces.txt diff --git a/lib/prism.rb b/lib/prism.rb index dab3420377214f..d809557fce101f 100644 --- a/lib/prism.rb +++ b/lib/prism.rb @@ -61,7 +61,8 @@ def initialize(version) # Prism::lex_compat(source, **options) -> LexCompat::Result # # Returns a parse result whose value is an array of tokens that closely - # resembles the return value of Ripper::lex. + # resembles the return value of Ripper::lex. The main difference is that the + # `:on_sp` token is not emitted. # # For supported options, see Prism::parse. def self.lex_compat(source, **options) @@ -71,8 +72,9 @@ def self.lex_compat(source, **options) # :call-seq: # Prism::lex_ripper(source) -> Array # - # This wraps the result of Ripper.lex. It produces almost exactly the - # same tokens. Raises SyntaxError if the syntax in source is invalid. + # This lexes with the Ripper lex. It drops any space events but otherwise + # returns the same tokens. Raises SyntaxError if the syntax in source is + # invalid. 
def self.lex_ripper(source) LexRipper.new(source).result # steep:ignore end diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb index 597e63c73e73b7..f7b9a0effc969d 100644 --- a/lib/prism/lex_compat.rb +++ b/lib/prism/lex_compat.rb @@ -226,7 +226,7 @@ def state end # Tokens where state should be ignored - # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end + # used for :on_comment, :on_heredoc_end, :on_embexpr_end class IgnoreStateToken < Token def ==(other) # :nodoc: self[0...-1] == other[0...-1] @@ -611,10 +611,10 @@ def self.build(opening) BOM_FLUSHED = RUBY_VERSION >= "3.3.0" private_constant :BOM_FLUSHED - attr_reader :options + attr_reader :source, :options - def initialize(code, **options) - @code = code + def initialize(source, **options) + @source = source @options = options end @@ -624,14 +624,12 @@ def result state = :default heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]] - result = Prism.lex(@code, **options) - source = result.source + result = Prism.lex(source, **options) result_value = result.value previous_state = nil #: State? last_heredoc_end = nil #: Integer? 
- eof_token = nil - bom = source.slice(0, 3) == "\xEF\xBB\xBF" + bom = source.byteslice(0..2) == "\xEF\xBB\xBF" result_value.each_with_index do |(token, lex_state), index| lineno = token.location.start_line @@ -743,7 +741,6 @@ def result Token.new([[lineno, column], event, value, lex_state]) when :on_eof - eof_token = token previous_token = result_value[index - 1][0] # If we're at the end of the file and the previous token was a @@ -766,7 +763,7 @@ def result end_offset += 3 end - tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state]) + tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state]) end end @@ -860,89 +857,7 @@ def result # We sort by location to compare against Ripper's output tokens.sort_by!(&:location) - # Add :on_sp tokens - tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token) - - Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source) - end - - def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token) - new_tokens = [] - - prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG) - prev_token_end = bom ? 
3 : 0 - - tokens.each do |token| - line, column = token.location - start_offset = source.line_to_byte_offset(line) + column - # Ripper reports columns on line 1 without counting the BOM, so we adjust to get the real offset - start_offset += 3 if line == 1 && bom - - if start_offset > prev_token_end - sp_value = source.slice(prev_token_end, start_offset - prev_token_end) - sp_line = source.line(prev_token_end) - sp_column = source.column(prev_token_end) - # Ripper reports columns on line 1 without counting the BOM - sp_column -= 3 if sp_line == 1 && bom - continuation_index = sp_value.byteindex("\\") - - # ripper emits up to three :on_sp tokens when line continuations are used - if continuation_index - next_whitespace_index = continuation_index + 1 - next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r" - next_whitespace_index += 1 - first_whitespace = sp_value[0...continuation_index] - continuation = sp_value[continuation_index...next_whitespace_index] - second_whitespace = sp_value[next_whitespace_index..] - - new_tokens << IgnoreStateToken.new([ - [sp_line, sp_column], - :on_sp, - first_whitespace, - prev_token_state - ]) unless first_whitespace.empty? - - new_tokens << IgnoreStateToken.new([ - [sp_line, sp_column + continuation_index], - :on_sp, - continuation, - prev_token_state - ]) - - new_tokens << IgnoreStateToken.new([ - [sp_line + 1, 0], - :on_sp, - second_whitespace, - prev_token_state - ]) unless second_whitespace.empty? 
- else - new_tokens << IgnoreStateToken.new([ - [sp_line, sp_column], - :on_sp, - sp_value, - prev_token_state - ]) - end - end - - new_tokens << token - prev_token_state = token.state - prev_token_end = start_offset + token.value.bytesize - end - - unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl - end_offset = eof_token.location.end_offset - if prev_token_end < end_offset - new_tokens << IgnoreStateToken.new([ - [source.line(prev_token_end), source.column(prev_token_end)], - :on_sp, - source.slice(prev_token_end, end_offset - prev_token_end), - prev_token_state - ]) - end - end - - new_tokens + Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source)) end end diff --git a/lib/prism/lex_ripper.rb b/lib/prism/lex_ripper.rb index 2054cf55ac0c70..4b5c3b77fd6112 100644 --- a/lib/prism/lex_ripper.rb +++ b/lib/prism/lex_ripper.rb @@ -19,6 +19,8 @@ def result lex(source).each do |token| case token[1] + when :on_sp + # skip when :on_tstring_content if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@")) previous[2] << token[2] diff --git a/test/prism/fixtures/bom_leading_space.txt b/test/prism/fixtures/bom_leading_space.txt deleted file mode 100644 index 48d3ee50ea47b0..00000000000000 --- a/test/prism/fixtures/bom_leading_space.txt +++ /dev/null @@ -1 +0,0 @@ - p (42) diff --git a/test/prism/fixtures/bom_spaces.txt b/test/prism/fixtures/bom_spaces.txt deleted file mode 100644 index c18ad4c21ad7e7..00000000000000 --- a/test/prism/fixtures/bom_spaces.txt +++ /dev/null @@ -1 +0,0 @@ -p ( 42 ) diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb index 280abd94ea3e64..2a0504c19f35f0 100644 --- a/test/prism/ruby/ripper_test.rb +++ b/test/prism/ruby/ripper_test.rb @@ -39,8 +39,6 @@ class RipperTest < TestCase # Skip these tests that we haven't implemented yet. 
omitted_sexp_raw = [ - "bom_leading_space.txt", - "bom_spaces.txt", "dos_endings.txt", "heredocs_with_fake_newlines.txt", "heredocs_with_ignored_newlines.txt", @@ -94,7 +92,7 @@ def test_lexer assert_equal(expected, lexer.parse[0].to_a) assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a) - assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event)) + assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event)) assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) } end @@ -123,17 +121,15 @@ def assert_ripper_sexp_raw(source) def assert_ripper_lex(source) prism = Translation::Ripper.lex(source) ripper = Ripper.lex(source) - - # Prism emits tokens by their order in the code, not in parse order - ripper.sort_by! { |elem| elem[0] } + ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp + ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order [prism.size, ripper.size].max.times do |i| expected = ripper[i] actual = prism[i] - # Since tokens related to heredocs are not emitted in the same order, # the state also doesn't line up. - if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end + if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end expected[3] = actual[3] = nil end From 300927b4bb1b41e9e848c063f2ca6109423a1729 Mon Sep 17 00:00:00 2001 From: Nobuyoshi Nakada Date: Tue, 20 Jan 2026 18:37:24 +0900 Subject: [PATCH 10/10] [Bug #21845] Rebuild gem extensions at RUBY_ABI_VERSION change As `TARGET_SO_DIR_TIMESTAMP` contains `ruby_version`, after bumping `RUBY_ABI_VERSION` it should not be existing. Usually such outdated files will be removed by `make outdate-bundled-gems` automatically invoked by `make up`. 
--- ext/extmk.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/ext/extmk.rb b/ext/extmk.rb index 8f847f4f3ab619..578e1cfa01d004 100755 --- a/ext/extmk.rb +++ b/ext/extmk.rb @@ -592,6 +592,7 @@ def create_makefile(*args, &block) build_complete = $(TARGET_GEM_DIR)/gem.build_complete install-so: build_complete clean-so:: clean-build_complete +$(build_complete) $(OBJS): $(TARGET_SO_DIR_TIMESTAMP) build_complete: $(build_complete) $(build_complete): $(TARGET_SO)