From 27bb1623cd048f3cbfc527cc315894803deabba2 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Mon, 19 Jan 2026 07:32:09 +0100
Subject: [PATCH 01/10] file.c: Optimize `rb_file_dirname_n` fixed costs

- `str_null_check` was performed twice, once by `FilePathStringValue`
  and a second time by `StringValueCStr`.
- `StringValueCStr` was checking for the terminator presence, but we
  don't care about that.
- `FilePathStringValue` calls `rb_str_new_frozen` to ensure `fname`
  isn't mutated, but that's costly for such a check. Instead we
  can do it in debug mode only.
- `rb_enc_get` is slow because it accepts arbitrary objects, even immediates,
  so it has to do numerous type checks. Add a much faster `rb_str_enc_get`
  when we know we're dealing with a string.
- `rb_enc_copy` is slow for the same reasons. Since we already have the
  encoding, we can use `rb_enc_str_new` instead.
---
 benchmark/file_dirname.yml |  5 +++++
 file.c                     | 45 ++++++++++++++++++++++++--------------
 internal/string.h          |  7 ++++++
 string.c                   |  2 +-
 4 files changed, 41 insertions(+), 18 deletions(-)
 create mode 100644 benchmark/file_dirname.yml

diff --git a/benchmark/file_dirname.yml b/benchmark/file_dirname.yml
new file mode 100644
index 00000000000000..d5c134ad4b6f40
--- /dev/null
+++ b/benchmark/file_dirname.yml
@@ -0,0 +1,5 @@
+prelude: |
+  # frozen_string_literal: true
+benchmark:
+  long: File.dirname("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml")
+  short: File.dirname("foo/bar")
diff --git a/file.c b/file.c
index 809253fab0d9da..9f4f45e5c67b10 100644
--- a/file.c
+++ b/file.c
@@ -214,15 +214,16 @@ file_path_convert(VALUE name)
     return name;
 }
 
-static rb_encoding *
+static void
 check_path_encoding(VALUE str)
 {
-    rb_encoding *enc = rb_enc_get(str);
-    if (!rb_enc_asciicompat(enc)) {
-        rb_raise(rb_eEncCompatError, "path name must be ASCII-compatible (%s): %"PRIsVALUE,
-                 rb_enc_name(enc), rb_str_inspect(str));
+    if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
+        rb_encoding *enc = rb_str_enc_get(str);
+        if (!rb_enc_asciicompat(enc)) {
+            rb_raise(rb_eEncCompatError, "path name must be ASCII-compatible (%s): %"PRIsVALUE,
+                     rb_enc_name(enc), rb_str_inspect(str));
+        }
     }
-    return enc;
 }
 
 VALUE
@@ -250,7 +251,7 @@ rb_get_path_check_convert(VALUE obj)
         rb_raise(rb_eArgError, "path name contains null byte");
     }
 
-    return rb_str_new4(obj);
+    return rb_str_new_frozen(obj);
 }
 
 VALUE
@@ -265,6 +266,19 @@ rb_get_path(VALUE obj)
     return rb_get_path_check_convert(rb_get_path_check_to_string(obj));
 }
 
+static inline VALUE
+check_path(VALUE obj, const char **cstr)
+{
+    VALUE str = rb_get_path_check_convert(rb_get_path_check_to_string(obj));
+#if RUBY_DEBUG
+    str = rb_str_new_frozen(str);
+#endif
+    *cstr = RSTRING_PTR(str);
+    return str;
+}
+
+#define CheckPath(str, cstr) RB_GC_GUARD(str) = check_path(str, &cstr);
+
 VALUE
 rb_str_encode_ospath(VALUE path)
 {
@@ -4952,7 +4966,8 @@ rb_file_s_basename(int argc, VALUE *argv, VALUE _)
     if (rb_check_arity(argc, 1, 2) == 2) {
         fext = argv[1];
         StringValue(fext);
-        enc = check_path_encoding(fext);
+        check_path_encoding(fext);
+        enc = rb_str_enc_get(fext);
     }
     fname = argv[0];
     FilePathStringValue(fname);
@@ -5031,10 +5046,9 @@ rb_file_dirname_n(VALUE fname, int n)
     const char **seps;
 
     if (n < 0) rb_raise(rb_eArgError, "negative level: %d", n);
-    FilePathStringValue(fname);
-    name = StringValueCStr(fname);
+    CheckPath(fname, name);
     end = name + RSTRING_LEN(fname);
-    enc = rb_enc_get(fname);
+    enc = rb_str_enc_get(fname);
     root = skiproot(name, end, enc);
 #ifdef DOSISH_UNC
     if (root > name + 1 && isdirsep(*name))
@@ -5077,24 +5091,21 @@ rb_file_dirname_n(VALUE fname, int n)
         }
     }
     if (p == name) {
-        dirname = rb_str_new(".", 1);
-        rb_enc_copy(dirname, fname);
-        return dirname;
+        return rb_enc_str_new(".", 1, enc);
     }
 #ifdef DOSISH_DRIVE_LETTER
     if (has_drive_letter(name) && isdirsep(*(name + 2))) {
         const char *top = skiproot(name + 2, end, enc);
-        dirname = rb_str_new(name, 3);
+        dirname = rb_enc_str_new(name, 3, enc);
         rb_str_cat(dirname, top, p - top);
     }
     else
 #endif
-    dirname = rb_str_new(name, p - name);
+    dirname = rb_enc_str_new(name, p - name, enc);
 #ifdef DOSISH_DRIVE_LETTER
     if (has_drive_letter(name) && root == name + 2 && p - name == 2)
         rb_str_cat(dirname, ".", 1);
 #endif
-    rb_enc_copy(dirname, fname);
     return dirname;
 }
 
diff --git a/internal/string.h b/internal/string.h
index cd1e8d79296ef4..dd5e20c0c68ed1 100644
--- a/internal/string.h
+++ b/internal/string.h
@@ -50,6 +50,13 @@ rb_str_enc_fastpath(VALUE str)
     return rb_str_encindex_fastpath(ENCODING_GET_INLINED(str));
 }
 
+static inline rb_encoding *
+rb_str_enc_get(VALUE str)
+{
+    RUBY_ASSERT(RB_TYPE_P(str, T_STRING));
+    return rb_enc_from_index(ENCODING_GET(str));
+}
+
 /* string.c */
 VALUE rb_str_dup_m(VALUE str);
 VALUE rb_fstring(VALUE);
diff --git a/string.c b/string.c
index 1e0b9929ef150c..464eab21463ff3 100644
--- a/string.c
+++ b/string.c
@@ -2880,7 +2880,7 @@ str_null_check(VALUE str, int *w)
     int minlen = 1;
 
     if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
-        rb_encoding *enc = rb_enc_get(str);
+        rb_encoding *enc = rb_str_enc_get(str);
         minlen = rb_enc_mbminlen(enc);
 
         if (minlen > 1) {

From 11d29d32d270d2a98642858fdca25a5272563995 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Mon, 19 Jan 2026 08:43:57 +0100
Subject: [PATCH 02/10] file.c: strrdirsep search from the back of the string
 for common encodings

`strrdirsep` quite inefficiently searches for the last separator from the front
of the string.

This is surprising but necessary because in Shift_JIS, `0x5c` can
be the second byte of some multi-byte characters; as such it's
not possible to do a pure ASCII search. And it's even more costly
because for each character we need to do expensive checks to
handle this possibility.

However in the overwhelming majority of cases, paths are encoded
in UTF-8 or ASCII, so for these common encodings we can use the
more logical and efficient algorithm.

```
compare-ruby: ruby 4.1.0dev (2026-01-17T14:40:03Z master 00a3b71eaf) +PRISM [arm64-darwin25]
built-ruby: ruby 4.1.0dev (2026-01-19T07:43:57Z file-dirname-lower.. a8d3535e5b) +PRISM [arm64-darwin25]
```

|       |compare-ruby|built-ruby|
|:------|-----------:|---------:|
|long   |      3.974M|   23.674M|
|       |           -|     5.96x|
|short  |     15.281M|   29.034M|
|       |           -|     1.90x|
---
 file.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/file.c b/file.c
index 9f4f45e5c67b10..7d47fc60a71438 100644
--- a/file.c
+++ b/file.c
@@ -3693,7 +3693,6 @@ skipprefixroot(const char *path, const char *end, rb_encoding *enc)
 #endif
 }
 
-#define strrdirsep rb_enc_path_last_separator
 char *
 rb_enc_path_last_separator(const char *path, const char *end, rb_encoding *enc)
 {
@@ -3712,6 +3711,30 @@ rb_enc_path_last_separator(const char *path, const char *end, rb_encoding *enc)
     return last;
 }
 
+static inline char *
+strrdirsep(const char *path, const char *end, rb_encoding *enc)
+{
+    if (RB_LIKELY(enc == NULL)) {
+        const char *cursor = end - 1;
+
+        while (isdirsep(cursor[0])) {
+            cursor--;
+        }
+
+        while (cursor >= path) {
+            if (isdirsep(cursor[0])) {
+                while (cursor > path && isdirsep(cursor[-1])) {
+                    cursor--;
+                }
+                return (char *)cursor;
+            }
+            cursor--;
+        }
+        return NULL;
+    }
+    return rb_enc_path_last_separator(path, end, enc);
+}
+
 static char *
 chompdirsep(const char *path, const char *end, rb_encoding *enc)
 {
@@ -5036,6 +5059,15 @@ rb_file_dirname(VALUE fname)
     return rb_file_dirname_n(fname, 1);
 }
 
+static inline rb_encoding *
+path_enc_get(VALUE str)
+{
+    if (RB_LIKELY(rb_str_enc_fastpath(str))) {
+        return NULL;
+    }
+    return rb_str_enc_get(str);
+}
+
 static VALUE
 rb_file_dirname_n(VALUE fname, int n)
 {
@@ -5048,7 +5080,7 @@ rb_file_dirname_n(VALUE fname, int n)
     if (n < 0) rb_raise(rb_eArgError, "negative level: %d", n);
     CheckPath(fname, name);
     end = name + RSTRING_LEN(fname);
-    enc = rb_str_enc_get(fname);
+    enc = path_enc_get(fname);
     root = skiproot(name, end, enc);
 #ifdef DOSISH_UNC
     if (root > name + 1 && isdirsep(*name))
@@ -5082,7 +5114,12 @@ rb_file_dirname_n(VALUE fname, int n)
                     if (i == n) i = 0;
                 }
                 else {
-                    Inc(p, end, enc);
+                    if (RB_UNLIKELY(enc)) {
+                        Inc(p, end, enc);
+                    }
+                    else {
+                        p++;
+                    }
                 }
             }
             p = seps[i];
@@ -5090,18 +5127,19 @@ rb_file_dirname_n(VALUE fname, int n)
             break;
         }
     }
+
     if (p == name) {
-        return rb_enc_str_new(".", 1, enc);
+        return rb_enc_str_new(".", 1, rb_str_enc_get(fname));
     }
 #ifdef DOSISH_DRIVE_LETTER
     if (has_drive_letter(name) && isdirsep(*(name + 2))) {
         const char *top = skiproot(name + 2, end, enc);
-        dirname = rb_enc_str_new(name, 3, enc);
+        dirname = rb_enc_str_new(name, 3, rb_str_enc_get(fname));
         rb_str_cat(dirname, top, p - top);
     }
     else
 #endif
-    dirname = rb_enc_str_new(name, p - name, enc);
+    dirname = rb_enc_str_new(name, p - name, rb_str_enc_get(fname));
 #ifdef DOSISH_DRIVE_LETTER
     if (has_drive_letter(name) && root == name + 2 && p - name == 2)
         rb_str_cat(dirname, ".", 1);

From 240a58ecc63d07b50ddfb027a9c9b9bc317ffc49 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Mon, 19 Jan 2026 08:52:57 +0100
Subject: [PATCH 03/10] file.c: skiproot doesn't need encoding

---
 file.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/file.c b/file.c
index 7d47fc60a71438..95bb557aeef7df 100644
--- a/file.c
+++ b/file.c
@@ -3636,7 +3636,7 @@ not_same_drive(VALUE path, int drive)
 #endif /* DOSISH_DRIVE_LETTER */
 
 static inline char *
-skiproot(const char *path, const char *end, rb_encoding *enc)
+skiproot(const char *path, const char *end)
 {
 #ifdef DOSISH_DRIVE_LETTER
     if (path + 2 <= end && has_drive_letter(path)) path += 2;
@@ -3689,7 +3689,7 @@ skipprefixroot(const char *path, const char *end, rb_encoding *enc)
     while (isdirsep(*p)) p++;
     return p;
 #else
-    return skiproot(path, end, enc);
+    return skiproot(path, end);
 #endif
 }
 
@@ -4075,7 +4075,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
                 rb_enc_associate(result, enc = fs_enc_check(result, fname));
                 p = pend;
             }
-            p = chompdirsep(skiproot(buf, p, enc), p, enc);
+            p = chompdirsep(skiproot(buf, p), p, enc);
             s += 2;
         }
     }
@@ -4100,7 +4100,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
         }
         else
 #endif /* defined DOSISH || defined __CYGWIN__ */
-            p = chompdirsep(skiproot(buf, p, enc), p, enc);
+            p = chompdirsep(skiproot(buf, p), p, enc);
     }
     else {
         size_t len;
@@ -4231,7 +4231,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
         BUFCOPY(b, s-b);
         rb_str_set_len(result, p-buf);
     }
-    if (p == skiproot(buf, p + !!*p, enc) - 1) p++;
+    if (p == skiproot(buf, p + !!*p) - 1) p++;
 
 #if USE_NTFS
     *p = '\0';
@@ -5081,7 +5081,7 @@ rb_file_dirname_n(VALUE fname, int n)
     CheckPath(fname, name);
     end = name + RSTRING_LEN(fname);
     enc = path_enc_get(fname);
-    root = skiproot(name, end, enc);
+    root = skiproot(name, end);
 #ifdef DOSISH_UNC
     if (root > name + 1 && isdirsep(*name))
         root = skipprefix(name = root - 2, end, enc);
@@ -5133,7 +5133,7 @@ rb_file_dirname_n(VALUE fname, int n)
     }
 #ifdef DOSISH_DRIVE_LETTER
     if (has_drive_letter(name) && isdirsep(*(name + 2))) {
-        const char *top = skiproot(name + 2, end, enc);
+        const char *top = skiproot(name + 2, end);
         dirname = rb_enc_str_new(name, 3, rb_str_enc_get(fname));
         rb_str_cat(dirname, top, p - top);
     }

From 990cf1286a165588c5b2f4d4fa98bb9d45c8fd11 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Mon, 19 Jan 2026 08:56:41 +0100
Subject: [PATCH 04/10] file.c: Inc macro now handles NULL encoding

---
 file.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/file.c b/file.c
index 95bb557aeef7df..365df5a292e765 100644
--- a/file.c
+++ b/file.c
@@ -3571,7 +3571,7 @@ static const char file_alt_separator[] = {FILE_ALT_SEPARATOR, '\0'};
 # define isADS(x) 0
 #endif
 
-#define Next(p, e, enc) ((p) + rb_enc_mbclen((p), (e), (enc)))
+#define Next(p, e, enc) ((p) + ((enc) ? rb_enc_mbclen((p), (e), (enc)) : 1))
 #define Inc(p, e, enc) ((p) = Next((p), (e), (enc)))
 
 #if defined(DOSISH_UNC)
@@ -5114,12 +5114,7 @@ rb_file_dirname_n(VALUE fname, int n)
                     if (i == n) i = 0;
                 }
                 else {
-                    if (RB_UNLIKELY(enc)) {
-                        Inc(p, end, enc);
-                    }
-                    else {
-                        p++;
-                    }
+                    Inc(p, end, enc);
                 }
             }
             p = seps[i];

From 2d10f1511806d3ef0c327ef0f89ff0ab8c1e5947 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Mon, 19 Jan 2026 09:03:42 +0100
Subject: [PATCH 05/10] file.c: dirname_n also uses strrdirsep when n > 1

It's both simpler and faster.

|       |compare-ruby|built-ruby|
|:------|-----------:|---------:|
|long   |      3.960M|   24.072M|
|       |           -|     6.08x|
|short  |     15.417M|   29.841M|
|       |           -|     1.94x|
|n_4    |      3.858M|   18.415M|
|       |           -|     4.77x|
---
 benchmark/file_dirname.yml |  1 +
 file.c                     | 34 ++++++----------------------------
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/benchmark/file_dirname.yml b/benchmark/file_dirname.yml
index d5c134ad4b6f40..43a81c937182b8 100644
--- a/benchmark/file_dirname.yml
+++ b/benchmark/file_dirname.yml
@@ -3,3 +3,4 @@ prelude: |
 benchmark:
   long: File.dirname("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml")
   short: File.dirname("foo/bar")
+  n_4: File.dirname("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml", 4)
diff --git a/file.c b/file.c
index 365df5a292e765..5d442b5bc5c1cd 100644
--- a/file.c
+++ b/file.c
@@ -5074,8 +5074,6 @@ rb_file_dirname_n(VALUE fname, int n)
     const char *name, *root, *p, *end;
     VALUE dirname;
     rb_encoding *enc;
-    VALUE sepsv = 0;
-    const char **seps;
 
     if (n < 0) rb_raise(rb_eArgError, "negative level: %d", n);
     CheckPath(fname, name);
@@ -5093,33 +5091,13 @@ rb_file_dirname_n(VALUE fname, int n)
         p = root;
     }
     else {
-        int i;
-        switch (n) {
-          case 0:
-            p = end;
-            break;
-          case 1:
-            if (!(p = strrdirsep(root, end, enc))) p = root;
-            break;
-          default:
-            seps = ALLOCV_N(const char *, sepsv, n);
-            for (i = 0; i < n; ++i) seps[i] = root;
-            i = 0;
-            for (p = root; p < end; ) {
-                if (isdirsep(*p)) {
-                    const char *tmp = p++;
-                    while (p < end && isdirsep(*p)) p++;
-                    if (p >= end) break;
-                    seps[i++] = tmp;
-                    if (i == n) i = 0;
-                }
-                else {
-                    Inc(p, end, enc);
-                }
+        p = end;
+        while (n) {
+            if (!(p = strrdirsep(root, p, enc))) {
+                p = root;
+                break;
             }
-            p = seps[i];
-            ALLOCV_END(sepsv);
-            break;
+            n--;
         }
     }
 

From 6fb50434e325dbaf0ae75ade5b47b26f9671a5d2 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Mon, 19 Jan 2026 17:04:15 +0100
Subject: [PATCH 06/10] file.c: Stop passing NULL for encoding

`rb_encoding *` is defined as `nonnull` so `if (enc)` is optimized
out by the compiler. We have to pass a boolean alongside it to
avoid crashes.
---
 file.c | 124 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 65 insertions(+), 59 deletions(-)

diff --git a/file.c b/file.c
index 5d442b5bc5c1cd..2829c27f6e7ddc 100644
--- a/file.c
+++ b/file.c
@@ -3571,8 +3571,8 @@ static const char file_alt_separator[] = {FILE_ALT_SEPARATOR, '\0'};
 # define isADS(x) 0
 #endif
 
-#define Next(p, e, enc) ((p) + ((enc) ? rb_enc_mbclen((p), (e), (enc)) : 1))
-#define Inc(p, e, enc) ((p) = Next((p), (e), (enc)))
+#define Next(p, e, mb_enc, enc) ((p) + ((mb_enc) ? rb_enc_mbclen((p), (e), (enc)) : 1))
+#define Inc(p, e, mb_enc, enc) ((p) = Next((p), (e), (mb_enc), (enc)))
 
 #if defined(DOSISH_UNC)
 #define has_unc(buf) (isdirsep((buf)[0]) && isdirsep((buf)[1]))
@@ -3645,31 +3645,37 @@ skiproot(const char *path, const char *end)
     return (char *)path;
 }
 
-#define nextdirsep rb_enc_path_next
-char *
-rb_enc_path_next(const char *s, const char *e, rb_encoding *enc)
+static inline char *
+enc_path_next(const char *s, const char *e, bool mb_enc, rb_encoding *enc)
 {
     while (s < e && !isdirsep(*s)) {
-        Inc(s, e, enc);
+        Inc(s, e, mb_enc, enc);
     }
     return (char *)s;
 }
 
+#define nextdirsep rb_enc_path_next
+char *
+rb_enc_path_next(const char *s, const char *e, rb_encoding *enc)
+{
+    return enc_path_next(s, e, true, enc);
+}
+
 #if defined(DOSISH_UNC) || defined(DOSISH_DRIVE_LETTER)
-#define skipprefix rb_enc_path_skip_prefix
+#define skipprefix enc_path_skip_prefix
 #else
-#define skipprefix(path, end, enc) (path)
+#define skipprefix(path, end, mb_enc, enc) (path)
 #endif
-char *
-rb_enc_path_skip_prefix(const char *path, const char *end, rb_encoding *enc)
+static inline char *
+enc_path_skip_prefix(const char *path, const char *end, bool mb_enc, rb_encoding *enc)
 {
 #if defined(DOSISH_UNC) || defined(DOSISH_DRIVE_LETTER)
 #ifdef DOSISH_UNC
     if (path + 2 <= end && isdirsep(path[0]) && isdirsep(path[1])) {
         path += 2;
         while (path < end && isdirsep(*path)) path++;
-        if ((path = rb_enc_path_next(path, end, enc)) < end && path[0] && path[1] && !isdirsep(path[1]))
-            path = rb_enc_path_next(path + 1, end, enc);
+        if ((path = enc_path_next(path, end, mb_enc, enc)) < end && path[0] && path[1] && !isdirsep(path[1]))
+            path = enc_path_next(path + 1, end, mb_enc, enc);
         return (char *)path;
     }
 #endif
@@ -3681,11 +3687,17 @@ rb_enc_path_skip_prefix(const char *path, const char *end, rb_encoding *enc)
     return (char *)path;
 }
 
+char *
+rb_enc_path_skip_prefix(const char *path, const char *end, rb_encoding *enc)
+{
+    return enc_path_skip_prefix(path, end, true, enc);
+}
+
 static inline char *
 skipprefixroot(const char *path, const char *end, rb_encoding *enc)
 {
 #if defined(DOSISH_UNC) || defined(DOSISH_DRIVE_LETTER)
-    char *p = skipprefix(path, end, enc);
+    char *p = skipprefix(path, end, true, enc);
     while (isdirsep(*p)) p++;
     return p;
 #else
@@ -3705,34 +3717,35 @@ rb_enc_path_last_separator(const char *path, const char *end, rb_encoding *enc)
             last = (char *)tmp;
         }
         else {
-            Inc(path, end, enc);
+            Inc(path, end, true, enc);
         }
     }
     return last;
 }
 
 static inline char *
-strrdirsep(const char *path, const char *end, rb_encoding *enc)
+strrdirsep(const char *path, const char *end, bool mb_enc, rb_encoding *enc)
 {
-    if (RB_LIKELY(enc == NULL)) {
-        const char *cursor = end - 1;
+    if (RB_UNLIKELY(mb_enc)) {
+        return rb_enc_path_last_separator(path, end, enc);
+    }
 
-        while (isdirsep(cursor[0])) {
-            cursor--;
-        }
+    const char *cursor = end - 1;
 
-        while (cursor >= path) {
-            if (isdirsep(cursor[0])) {
-                while (cursor > path && isdirsep(cursor[-1])) {
-                    cursor--;
-                }
-                return (char *)cursor;
+    while (isdirsep(cursor[0])) {
+        cursor--;
+    }
+
+    while (cursor >= path) {
+        if (isdirsep(cursor[0])) {
+            while (cursor > path && isdirsep(cursor[-1])) {
+                cursor--;
             }
-            cursor--;
+            return (char *)cursor;
         }
-        return NULL;
+        cursor--;
     }
-    return rb_enc_path_last_separator(path, end, enc);
+    return NULL;
 }
 
 static char *
@@ -3745,7 +3758,7 @@ chompdirsep(const char *path, const char *end, rb_encoding *enc)
             if (path >= end) return (char *)last;
         }
         else {
-            Inc(path, end, enc);
+            Inc(path, end, true, enc);
         }
     }
     return (char *)path;
@@ -3790,7 +3803,7 @@ ntfs_tail(const char *path, const char *end, rb_encoding *enc)
             if (isADS(*path)) path++;
         }
         else {
-            Inc(path, end, enc);
+            Inc(path, end, true, enc);
         }
     }
     return (char *)path;
@@ -3852,7 +3865,7 @@ copy_home_path(VALUE result, const char *dir)
     rb_enc_associate_index(result, encidx);
 #if defined DOSISH || defined __CYGWIN__
     enc = rb_enc_from_index(encidx);
-    for (bend = (p = buf) + dirlen; p < bend; Inc(p, bend, enc)) {
+    for (bend = (p = buf) + dirlen; p < bend; Inc(p, bend, true, enc)) {
         if (*p == '\\') {
             *p = '/';
         }
@@ -4096,7 +4109,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
         if (isdirsep(*s)) {
             /* specified full path, but not drive letter nor UNC */
             /* we need to get the drive letter or UNC share name */
-            p = skipprefix(buf, p, enc);
+            p = skipprefix(buf, p, true, enc);
         }
         else
 #endif /* defined DOSISH || defined __CYGWIN__ */
@@ -4124,7 +4137,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
     rb_str_set_len(result, p-buf+1);
     BUFCHECK(bdiff + 1 >= buflen);
     p[1] = 0;
-    root = skipprefix(buf, p+1, enc);
+    root = skipprefix(buf, p+1, true, enc);
 
     b = s;
     while (*s) {
@@ -4140,7 +4153,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
                         /* We must go back to the parent */
                         char *n;
                         *p = '\0';
-                        if (!(n = strrdirsep(root, p, enc))) {
+                        if (!(n = strrdirsep(root, p, true, enc))) {
                             *p = '/';
                         }
                         else {
@@ -4203,7 +4216,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
                 }
             }
 #endif /* __APPLE__ */
-            Inc(s, fend, enc);
+            Inc(s, fend, true, enc);
             break;
         }
     }
@@ -4503,7 +4516,7 @@ realpath_rec(long *prefixlenp, VALUE *resolvedp, const char *unresolved, VALUE f
             if (*prefixlenp < RSTRING_LEN(*resolvedp)) {
                 const char *resolved_str = RSTRING_PTR(*resolvedp);
                 const char *resolved_names = resolved_str + *prefixlenp;
-                const char *lastsep = strrdirsep(resolved_names, resolved_str + RSTRING_LEN(*resolvedp), enc);
+                const char *lastsep = strrdirsep(resolved_names, resolved_str + RSTRING_LEN(*resolvedp), true, enc);
                 long len = lastsep ? lastsep - resolved_names : 0;
                 rb_str_resize(*resolvedp, *prefixlenp + len);
             }
@@ -4653,7 +4666,7 @@ rb_check_realpath_emulate(VALUE basedir, VALUE path, rb_encoding *origenc, enum
         if (*prefixptr == FILE_ALT_SEPARATOR) {
             *prefixptr = '/';
         }
-        Inc(prefixptr, pend, enc);
+        Inc(prefixptr, pend, true, enc);
     }
 #endif
 
@@ -4907,7 +4920,7 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin
     long f = 0, n = -1;
 
     end = name + (alllen ? (size_t)*alllen : strlen(name));
-    name = skipprefix(name, end, enc);
+    name = skipprefix(name, end, true, enc);
 #if defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC
     root = name;
 #endif
@@ -4934,7 +4947,7 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin
 #endif /* defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC */
     }
     else {
-        if (!(p = strrdirsep(name, end, enc))) {
+        if (!(p = strrdirsep(name, end, true, enc))) {
             p = name;
         }
         else {
@@ -4946,7 +4959,7 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin
         n = chompdirsep(p, end, enc) - p;
 #endif
         for (q = p; q - p < n && *q == '.'; q++);
-        for (e = 0; q - p < n; Inc(q, end, enc)) {
+        for (e = 0; q - p < n; Inc(q, end, true, enc)) {
             if (*q == '.') e = q;
         }
         if (e) f = e - p;
@@ -5059,30 +5072,23 @@ rb_file_dirname(VALUE fname)
     return rb_file_dirname_n(fname, 1);
 }
 
-static inline rb_encoding *
-path_enc_get(VALUE str)
-{
-    if (RB_LIKELY(rb_str_enc_fastpath(str))) {
-        return NULL;
-    }
-    return rb_str_enc_get(str);
-}
-
 static VALUE
 rb_file_dirname_n(VALUE fname, int n)
 {
     const char *name, *root, *p, *end;
     VALUE dirname;
-    rb_encoding *enc;
 
     if (n < 0) rb_raise(rb_eArgError, "negative level: %d", n);
     CheckPath(fname, name);
     end = name + RSTRING_LEN(fname);
-    enc = path_enc_get(fname);
+
+    bool mb_enc = !rb_str_enc_fastpath(fname);
+    rb_encoding *enc = rb_str_enc_get(fname);
+
     root = skiproot(name, end);
 #ifdef DOSISH_UNC
     if (root > name + 1 && isdirsep(*name))
-        root = skipprefix(name = root - 2, end, enc);
+        root = skipprefix(name = root - 2, end, mb_enc, enc);
 #else
     if (root > name + 1)
         name = root - 1;
@@ -5093,7 +5099,7 @@ rb_file_dirname_n(VALUE fname, int n)
     else {
         p = end;
         while (n) {
-            if (!(p = strrdirsep(root, p, enc))) {
+            if (!(p = strrdirsep(root, p, mb_enc, enc))) {
                 p = root;
                 break;
             }
@@ -5102,17 +5108,17 @@ rb_file_dirname_n(VALUE fname, int n)
     }
 
     if (p == name) {
-        return rb_enc_str_new(".", 1, rb_str_enc_get(fname));
+        return rb_enc_str_new(".", 1, enc);
     }
 #ifdef DOSISH_DRIVE_LETTER
     if (has_drive_letter(name) && isdirsep(*(name + 2))) {
         const char *top = skiproot(name + 2, end);
-        dirname = rb_enc_str_new(name, 3, rb_str_enc_get(fname));
+        dirname = rb_enc_str_new(name, 3, enc);
         rb_str_cat(dirname, top, p - top);
     }
     else
 #endif
-    dirname = rb_enc_str_new(name, p - name, rb_str_enc_get(fname));
+    dirname = rb_enc_str_new(name, p - name, enc);
 #ifdef DOSISH_DRIVE_LETTER
     if (has_drive_letter(name) && root == name + 2 && p - name == 2)
         rb_str_cat(dirname, ".", 1);
@@ -5137,7 +5143,7 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc)
 {
     const char *p, *e, *end = name + (len ? *len : (long)strlen(name));
 
-    p = strrdirsep(name, end, enc);	/* get the last path component */
+    p = strrdirsep(name, end, true, enc);	/* get the last path component */
     if (!p)
         p = name;
     else
@@ -5170,7 +5176,7 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc)
 #endif
         else if (isdirsep(*p))
             break;
-        Inc(p, end, enc);
+        Inc(p, end, true, enc);
     }
 
     if (len) {

From 53fe9933fd6c62f3a7f1ed2908a99510c2f27adc Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Tue, 20 Jan 2026 08:56:17 +0100
Subject: [PATCH 07/10] Optimize `File.extname` for common encodings

Similar optimizations to the ones performed in GH-15907.

- Skip the expensive multi-byte encoding handling for the common
  encodings that are known to be safe.
- Use `CheckPath` to save on copying the argument and only scan it for
  NULL bytes once.
- Create the return string with rb_enc_str_new instead of rb_str_subseq
  as it's going to be a very small string anyway.

This could be optimized a little bit further by searching for both `.` and `dirsep`
in one pass.

```
compare-ruby: ruby 4.1.0dev (2026-01-19T03:51:30Z master 631bf19b37) +PRISM [arm64-darwin25]
built-ruby: ruby 4.1.0dev (2026-01-20T07:33:42Z master 6fb50434e3) +PRISM [arm64-darwin25]
```

|           |compare-ruby|built-ruby|
|:----------|-----------:|---------:|
|long       |      3.606M|   22.229M|
|           |           -|     6.17x|
|long_name  |      2.254M|   13.416M|
|           |           -|     5.95x|
|short      |     16.488M|   29.969M|
|           |           -|     1.82x|
---
 benchmark/file_extname.yml |  6 ++++
 file.c                     | 61 +++++++++++++++++++++-----------------
 2 files changed, 40 insertions(+), 27 deletions(-)
 create mode 100644 benchmark/file_extname.yml

diff --git a/benchmark/file_extname.yml b/benchmark/file_extname.yml
new file mode 100644
index 00000000000000..fb16e558405530
--- /dev/null
+++ b/benchmark/file_extname.yml
@@ -0,0 +1,6 @@
+prelude: |
+  # frozen_string_literal: true
+benchmark:
+  long: File.extname("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml")
+  long_name: File.extname("Users_george_src_github.com_ruby_ruby_benchmark_file_dirname.yml")
+  short: File.extname("foo/bar")
diff --git a/file.c b/file.c
index 2829c27f6e7ddc..a98bb9728e00db 100644
--- a/file.c
+++ b/file.c
@@ -5126,24 +5126,12 @@ rb_file_dirname_n(VALUE fname, int n)
     return dirname;
 }
 
-/*
- * accept a String, and return the pointer of the extension.
- * if len is passed, set the length of extension to it.
- * returned pointer is in ``name'' or NULL.
- *                 returns   *len
- *   no dot        NULL      0
- *   dotfile       top       0
- *   end with dot  dot       1
- *   .ext          dot       len of .ext
- *   .ext:stream   dot       len of .ext without :stream (NTFS only)
- *
- */
-const char *
-ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc)
+static inline const char *
+enc_find_extname(const char *name, long *len, bool mb_enc, rb_encoding *enc)
 {
     const char *p, *e, *end = name + (len ? *len : (long)strlen(name));
 
-    p = strrdirsep(name, end, true, enc);	/* get the last path component */
+    p = strrdirsep(name, end, mb_enc, enc);	/* get the last path component */
     if (!p)
         p = name;
     else
@@ -5176,7 +5164,7 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc)
 #endif
         else if (isdirsep(*p))
             break;
-        Inc(p, end, true, enc);
+        Inc(p, end, mb_enc, enc);
     }
 
     if (len) {
@@ -5191,6 +5179,24 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc)
     return e;
 }
 
+/*
+ * accept a String, and return the pointer of the extension.
+ * if len is passed, set the length of extension to it.
+ * returned pointer is in ``name'' or NULL.
+ *                 returns   *len
+ *   no dot        NULL      0
+ *   dotfile       top       0
+ *   end with dot  dot       1
+ *   .ext          dot       len of .ext
+ *   .ext:stream   dot       len of .ext without :stream (NTFS only)
+ *
+ */
+const char *
+ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc)
+{
+    return enc_find_extname(name, len, true, enc);
+}
+
 /*
  *  call-seq:
  *     File.extname(path)  ->  string
@@ -5220,18 +5226,19 @@ ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc)
 static VALUE
 rb_file_s_extname(VALUE klass, VALUE fname)
 {
-    const char *name, *e;
-    long len;
-    VALUE extname;
+    const char *name;
+    CheckPath(fname, name);
+    long len = RSTRING_LEN(fname);
 
-    FilePathStringValue(fname);
-    name = StringValueCStr(fname);
-    len = RSTRING_LEN(fname);
-    e = ruby_enc_find_extname(name, &len, rb_enc_get(fname));
-    if (len < 1)
-        return rb_str_new(0, 0);
-    extname = rb_str_subseq(fname, e - name, len); /* keep the dot, too! */
-    return extname;
+    if (len < 1) {
+        return rb_enc_str_new(0, 0, rb_str_enc_get(fname));
+    }
+
+    bool mb_enc = !rb_str_enc_fastpath(fname);
+    rb_encoding *enc = rb_str_enc_get(fname);
+
+    const char *ext = enc_find_extname(name, &len, mb_enc, enc);
+    return rb_enc_str_new(ext, len, enc);
 }
 
 /*

From 35a7b5159f39de2cac848c072674e5350cc41aa4 Mon Sep 17 00:00:00 2001
From: Benoit Daloze <eregontp@gmail.com>
Date: Tue, 20 Jan 2026 08:53:39 +0100
Subject: [PATCH 08/10] [ruby/prism] Add Ripper :on_sp events for
 Prism.lex_compat and Prism::Translation::Ripper

* Handle line continuations.
* Handle space at the end of file in LexCompat.

https://github.com/ruby/prism/commit/32bd13eb7d

Co-authored-by: Earlopain <14981592+Earlopain@users.noreply.github.com>
---
 lib/prism.rb                              |   8 +-
 lib/prism/lex_compat.rb                   | 101 ++++++++++++++++++++--
 lib/prism/lex_ripper.rb                   |   2 -
 test/prism/fixtures/bom_leading_space.txt |   1 +
 test/prism/fixtures/bom_spaces.txt        |   1 +
 test/prism/ruby/ripper_test.rb            |  12 ++-
 6 files changed, 106 insertions(+), 19 deletions(-)
 create mode 100644 test/prism/fixtures/bom_leading_space.txt
 create mode 100644 test/prism/fixtures/bom_spaces.txt

diff --git a/lib/prism.rb b/lib/prism.rb
index d809557fce101f..dab3420377214f 100644
--- a/lib/prism.rb
+++ b/lib/prism.rb
@@ -61,8 +61,7 @@ def initialize(version)
   #   Prism::lex_compat(source, **options) -> LexCompat::Result
   #
   # Returns a parse result whose value is an array of tokens that closely
-  # resembles the return value of Ripper::lex. The main difference is that the
-  # `:on_sp` token is not emitted.
+  # resembles the return value of Ripper::lex.
   #
   # For supported options, see Prism::parse.
   def self.lex_compat(source, **options)
@@ -72,9 +71,8 @@ def self.lex_compat(source, **options)
   # :call-seq:
   #   Prism::lex_ripper(source) -> Array
   #
-  # This lexes with the Ripper lex. It drops any space events but otherwise
-  # returns the same tokens. Raises SyntaxError if the syntax in source is
-  # invalid.
+  # This wraps the result of Ripper.lex. It produces almost exactly the
+  # same tokens. Raises SyntaxError if the syntax in source is invalid.
   def self.lex_ripper(source)
     LexRipper.new(source).result # steep:ignore
   end
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb
index f7b9a0effc969d..597e63c73e73b7 100644
--- a/lib/prism/lex_compat.rb
+++ b/lib/prism/lex_compat.rb
@@ -226,7 +226,7 @@ def state
     end
 
     # Tokens where state should be ignored
-    # used for :on_comment, :on_heredoc_end, :on_embexpr_end
+    # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
     class IgnoreStateToken < Token
       def ==(other) # :nodoc:
         self[0...-1] == other[0...-1]
@@ -611,10 +611,10 @@ def self.build(opening)
     BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
     private_constant :BOM_FLUSHED
 
-    attr_reader :source, :options
+    attr_reader :options
 
-    def initialize(source, **options)
-      @source = source
+    def initialize(code, **options)
+      @code = code
       @options = options
     end
 
@@ -624,12 +624,14 @@ def result
       state = :default
       heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
 
-      result = Prism.lex(source, **options)
+      result = Prism.lex(@code, **options)
+      source = result.source
       result_value = result.value
       previous_state = nil #: State?
       last_heredoc_end = nil #: Integer?
+      eof_token = nil
 
-      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
+      bom = source.slice(0, 3) == "\xEF\xBB\xBF"
 
       result_value.each_with_index do |(token, lex_state), index|
         lineno = token.location.start_line
@@ -741,6 +743,7 @@ def result
 
             Token.new([[lineno, column], event, value, lex_state])
           when :on_eof
+            eof_token = token
             previous_token = result_value[index - 1][0]
 
             # If we're at the end of the file and the previous token was a
@@ -763,7 +766,7 @@ def result
                   end_offset += 3
                 end
 
-                tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
+                tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
               end
             end
 
@@ -857,7 +860,89 @@ def result
       # We sort by location to compare against Ripper's output
       tokens.sort_by!(&:location)
 
-      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
+      # Add :on_sp tokens
+      tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)
+
+      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
+    end
+
+    def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
+      new_tokens = []
+
+      prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG)
+      prev_token_end = bom ? 3 : 0
+
+      tokens.each do |token|
+        line, column = token.location
+        start_offset = source.line_to_byte_offset(line) + column
+        # Ripper reports columns on line 1 without counting the BOM, so we adjust to get the real offset
+        start_offset += 3 if line == 1 && bom
+
+        if start_offset > prev_token_end
+          sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
+          sp_line = source.line(prev_token_end)
+          sp_column = source.column(prev_token_end)
+          # Ripper reports columns on line 1 without counting the BOM
+          sp_column -= 3 if sp_line == 1 && bom
+          continuation_index = sp_value.byteindex("\\")
+
+          # ripper emits up to three :on_sp tokens when line continuations are used
+          if continuation_index
+            next_whitespace_index = continuation_index + 1
+            next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
+            next_whitespace_index += 1
+            first_whitespace = sp_value[0...continuation_index]
+            continuation = sp_value[continuation_index...next_whitespace_index]
+            second_whitespace = sp_value[next_whitespace_index..]
+
+            new_tokens << IgnoreStateToken.new([
+              [sp_line, sp_column],
+              :on_sp,
+              first_whitespace,
+              prev_token_state
+            ]) unless first_whitespace.empty?
+
+            new_tokens << IgnoreStateToken.new([
+              [sp_line, sp_column + continuation_index],
+              :on_sp,
+              continuation,
+              prev_token_state
+            ])
+
+            new_tokens << IgnoreStateToken.new([
+              [sp_line + 1, 0],
+              :on_sp,
+              second_whitespace,
+              prev_token_state
+            ]) unless second_whitespace.empty?
+          else
+            new_tokens << IgnoreStateToken.new([
+              [sp_line, sp_column],
+              :on_sp,
+              sp_value,
+              prev_token_state
+            ])
+          end
+        end
+
+        new_tokens << token
+        prev_token_state = token.state
+        prev_token_end = start_offset + token.value.bytesize
+      end
+
+      unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
+        end_offset = eof_token.location.end_offset
+        if prev_token_end < end_offset
+          new_tokens << IgnoreStateToken.new([
+            [source.line(prev_token_end), source.column(prev_token_end)],
+            :on_sp,
+            source.slice(prev_token_end, end_offset - prev_token_end),
+            prev_token_state
+          ])
+        end
+      end
+
+      new_tokens
     end
   end
 
diff --git a/lib/prism/lex_ripper.rb b/lib/prism/lex_ripper.rb
index 4b5c3b77fd6112..2054cf55ac0c70 100644
--- a/lib/prism/lex_ripper.rb
+++ b/lib/prism/lex_ripper.rb
@@ -19,8 +19,6 @@ def result
 
       lex(source).each do |token|
         case token[1]
-        when :on_sp
-          # skip
         when :on_tstring_content
           if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
             previous[2] << token[2]
diff --git a/test/prism/fixtures/bom_leading_space.txt b/test/prism/fixtures/bom_leading_space.txt
new file mode 100644
index 00000000000000..48d3ee50ea47b0
--- /dev/null
+++ b/test/prism/fixtures/bom_leading_space.txt
@@ -0,0 +1 @@
+﻿ p (42)
diff --git a/test/prism/fixtures/bom_spaces.txt b/test/prism/fixtures/bom_spaces.txt
new file mode 100644
index 00000000000000..c18ad4c21ad7e7
--- /dev/null
+++ b/test/prism/fixtures/bom_spaces.txt
@@ -0,0 +1 @@
+﻿p ( 42 )
diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb
index 2a0504c19f35f0..280abd94ea3e64 100644
--- a/test/prism/ruby/ripper_test.rb
+++ b/test/prism/ruby/ripper_test.rb
@@ -39,6 +39,8 @@ class RipperTest < TestCase
 
     # Skip these tests that we haven't implemented yet.
     omitted_sexp_raw = [
+      "bom_leading_space.txt",
+      "bom_spaces.txt",
       "dos_endings.txt",
       "heredocs_with_fake_newlines.txt",
       "heredocs_with_ignored_newlines.txt",
@@ -92,7 +94,7 @@ def test_lexer
       assert_equal(expected, lexer.parse[0].to_a)
       assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)
 
-      assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
+      assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
       assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
     end
 
@@ -121,15 +123,17 @@ def assert_ripper_sexp_raw(source)
     def assert_ripper_lex(source)
       prism = Translation::Ripper.lex(source)
       ripper = Ripper.lex(source)
-      ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp
-      ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order
+
+      # Prism emits tokens by their order in the code, not in parse order
+      ripper.sort_by! { |elem| elem[0] }
 
       [prism.size, ripper.size].max.times do |i|
         expected = ripper[i]
         actual = prism[i]
+
         # Since tokens related to heredocs are not emitted in the same order,
         # the state also doesn't line up.
-        if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
+        if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
           expected[3] = actual[3] = nil
         end
 

From 58f1127b51cf4fbb1f334f8701a041f40701dca2 Mon Sep 17 00:00:00 2001
From: Hiroshi SHIBATA <hsbt@ruby-lang.org>
Date: Tue, 20 Jan 2026 19:10:16 +0900
Subject: [PATCH 09/10] Revert "[ruby/prism] Add Ripper :on_sp events for
 Prism.lex_compat and Prism::Translation::Ripper"

This reverts commit 35a7b5159f39de2cac848c072674e5350cc41aa4.

This broke syntax_suggest.

https://github.com/ruby/ruby/actions/runs/21167011751/job/60874111912
---
 lib/prism.rb                              |   8 +-
 lib/prism/lex_compat.rb                   | 101 ++--------------------
 lib/prism/lex_ripper.rb                   |   2 +
 test/prism/fixtures/bom_leading_space.txt |   1 -
 test/prism/fixtures/bom_spaces.txt        |   1 -
 test/prism/ruby/ripper_test.rb            |  12 +--
 6 files changed, 19 insertions(+), 106 deletions(-)
 delete mode 100644 test/prism/fixtures/bom_leading_space.txt
 delete mode 100644 test/prism/fixtures/bom_spaces.txt

diff --git a/lib/prism.rb b/lib/prism.rb
index dab3420377214f..d809557fce101f 100644
--- a/lib/prism.rb
+++ b/lib/prism.rb
@@ -61,7 +61,8 @@ def initialize(version)
   #   Prism::lex_compat(source, **options) -> LexCompat::Result
   #
   # Returns a parse result whose value is an array of tokens that closely
-  # resembles the return value of Ripper::lex.
+  # resembles the return value of Ripper::lex. The main difference is that the
+  # `:on_sp` token is not emitted.
   #
   # For supported options, see Prism::parse.
   def self.lex_compat(source, **options)
@@ -71,8 +72,9 @@ def self.lex_compat(source, **options)
   # :call-seq:
   #   Prism::lex_ripper(source) -> Array
   #
-  # This wraps the result of Ripper.lex. It produces almost exactly the
-  # same tokens. Raises SyntaxError if the syntax in source is invalid.
+  # This lexes with the Ripper lex. It drops any space events but otherwise
+  # returns the same tokens. Raises SyntaxError if the syntax in source is
+  # invalid.
   def self.lex_ripper(source)
     LexRipper.new(source).result # steep:ignore
   end
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb
index 597e63c73e73b7..f7b9a0effc969d 100644
--- a/lib/prism/lex_compat.rb
+++ b/lib/prism/lex_compat.rb
@@ -226,7 +226,7 @@ def state
     end
 
     # Tokens where state should be ignored
-    # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
+    # used for :on_comment, :on_heredoc_end, :on_embexpr_end
     class IgnoreStateToken < Token
       def ==(other) # :nodoc:
         self[0...-1] == other[0...-1]
@@ -611,10 +611,10 @@ def self.build(opening)
     BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
     private_constant :BOM_FLUSHED
 
-    attr_reader :options
+    attr_reader :source, :options
 
-    def initialize(code, **options)
-      @code = code
+    def initialize(source, **options)
+      @source = source
       @options = options
     end
 
@@ -624,14 +624,12 @@ def result
       state = :default
       heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
 
-      result = Prism.lex(@code, **options)
-      source = result.source
+      result = Prism.lex(source, **options)
       result_value = result.value
       previous_state = nil #: State?
       last_heredoc_end = nil #: Integer?
-      eof_token = nil
 
-      bom = source.slice(0, 3) == "\xEF\xBB\xBF"
+      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
 
       result_value.each_with_index do |(token, lex_state), index|
         lineno = token.location.start_line
@@ -743,7 +741,6 @@ def result
 
             Token.new([[lineno, column], event, value, lex_state])
           when :on_eof
-            eof_token = token
             previous_token = result_value[index - 1][0]
 
             # If we're at the end of the file and the previous token was a
@@ -766,7 +763,7 @@ def result
                   end_offset += 3
                 end
 
-                tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
+                tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
               end
             end
 
@@ -860,89 +857,7 @@ def result
       # We sort by location to compare against Ripper's output
       tokens.sort_by!(&:location)
 
-      # Add :on_sp tokens
-      tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)
-
-      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
-    end
-
-    def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
-      new_tokens = []
-
-      prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG)
-      prev_token_end = bom ? 3 : 0
-
-      tokens.each do |token|
-        line, column = token.location
-        start_offset = source.line_to_byte_offset(line) + column
-        # Ripper reports columns on line 1 without counting the BOM, so we adjust to get the real offset
-        start_offset += 3 if line == 1 && bom
-
-        if start_offset > prev_token_end
-          sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
-          sp_line = source.line(prev_token_end)
-          sp_column = source.column(prev_token_end)
-          # Ripper reports columns on line 1 without counting the BOM
-          sp_column -= 3 if sp_line == 1 && bom
-          continuation_index = sp_value.byteindex("\\")
-
-          # ripper emits up to three :on_sp tokens when line continuations are used
-          if continuation_index
-            next_whitespace_index = continuation_index + 1
-            next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
-            next_whitespace_index += 1
-            first_whitespace = sp_value[0...continuation_index]
-            continuation = sp_value[continuation_index...next_whitespace_index]
-            second_whitespace = sp_value[next_whitespace_index..]
-
-            new_tokens << IgnoreStateToken.new([
-              [sp_line, sp_column],
-              :on_sp,
-              first_whitespace,
-              prev_token_state
-            ]) unless first_whitespace.empty?
-
-            new_tokens << IgnoreStateToken.new([
-              [sp_line, sp_column + continuation_index],
-              :on_sp,
-              continuation,
-              prev_token_state
-            ])
-
-            new_tokens << IgnoreStateToken.new([
-              [sp_line + 1, 0],
-              :on_sp,
-              second_whitespace,
-              prev_token_state
-            ]) unless second_whitespace.empty?
-          else
-            new_tokens << IgnoreStateToken.new([
-              [sp_line, sp_column],
-              :on_sp,
-              sp_value,
-              prev_token_state
-            ])
-          end
-        end
-
-        new_tokens << token
-        prev_token_state = token.state
-        prev_token_end = start_offset + token.value.bytesize
-      end
-
-      unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
-        end_offset = eof_token.location.end_offset
-        if prev_token_end < end_offset
-          new_tokens << IgnoreStateToken.new([
-            [source.line(prev_token_end), source.column(prev_token_end)],
-            :on_sp,
-            source.slice(prev_token_end, end_offset - prev_token_end),
-            prev_token_state
-          ])
-        end
-      end
-
-      new_tokens
+      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
     end
   end
 
diff --git a/lib/prism/lex_ripper.rb b/lib/prism/lex_ripper.rb
index 2054cf55ac0c70..4b5c3b77fd6112 100644
--- a/lib/prism/lex_ripper.rb
+++ b/lib/prism/lex_ripper.rb
@@ -19,6 +19,8 @@ def result
 
       lex(source).each do |token|
         case token[1]
+        when :on_sp
+          # skip
         when :on_tstring_content
           if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
             previous[2] << token[2]
diff --git a/test/prism/fixtures/bom_leading_space.txt b/test/prism/fixtures/bom_leading_space.txt
deleted file mode 100644
index 48d3ee50ea47b0..00000000000000
--- a/test/prism/fixtures/bom_leading_space.txt
+++ /dev/null
@@ -1 +0,0 @@
-﻿ p (42)
diff --git a/test/prism/fixtures/bom_spaces.txt b/test/prism/fixtures/bom_spaces.txt
deleted file mode 100644
index c18ad4c21ad7e7..00000000000000
--- a/test/prism/fixtures/bom_spaces.txt
+++ /dev/null
@@ -1 +0,0 @@
-﻿p ( 42 )
diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb
index 280abd94ea3e64..2a0504c19f35f0 100644
--- a/test/prism/ruby/ripper_test.rb
+++ b/test/prism/ruby/ripper_test.rb
@@ -39,8 +39,6 @@ class RipperTest < TestCase
 
     # Skip these tests that we haven't implemented yet.
     omitted_sexp_raw = [
-      "bom_leading_space.txt",
-      "bom_spaces.txt",
       "dos_endings.txt",
       "heredocs_with_fake_newlines.txt",
       "heredocs_with_ignored_newlines.txt",
@@ -94,7 +92,7 @@ def test_lexer
       assert_equal(expected, lexer.parse[0].to_a)
       assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)
 
-      assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
+      assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
       assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
     end
 
@@ -123,17 +121,15 @@ def assert_ripper_sexp_raw(source)
     def assert_ripper_lex(source)
       prism = Translation::Ripper.lex(source)
       ripper = Ripper.lex(source)
-
-      # Prism emits tokens by their order in the code, not in parse order
-      ripper.sort_by! { |elem| elem[0] }
+      ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp
+      ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order
 
       [prism.size, ripper.size].max.times do |i|
         expected = ripper[i]
         actual = prism[i]
-
         # Since tokens related to heredocs are not emitted in the same order,
         # the state also doesn't line up.
-        if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
+        if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
           expected[3] = actual[3] = nil
         end
 

From 300927b4bb1b41e9e848c063f2ca6109423a1729 Mon Sep 17 00:00:00 2001
From: Nobuyoshi Nakada <nobu@ruby-lang.org>
Date: Tue, 20 Jan 2026 18:37:24 +0900
Subject: [PATCH 10/10] [Bug #21845] Rebuild gem extensions at RUBY_ABI_VERSION
 change

As the path of `TARGET_SO_DIR_TIMESTAMP` contains `ruby_version`, that
file should not exist right after `RUBY_ABI_VERSION` is bumped.  Usually
such outdated files are removed by `make outdate-bundled-gems`, which is
invoked automatically by `make up`.
---
 ext/extmk.rb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ext/extmk.rb b/ext/extmk.rb
index 8f847f4f3ab619..578e1cfa01d004 100755
--- a/ext/extmk.rb
+++ b/ext/extmk.rb
@@ -592,6 +592,7 @@ def create_makefile(*args, &block)
 build_complete = $(TARGET_GEM_DIR)/gem.build_complete
 install-so: build_complete
 clean-so:: clean-build_complete
+$(build_complete) $(OBJS): $(TARGET_SO_DIR_TIMESTAMP)
 
 build_complete: $(build_complete)
 $(build_complete): $(TARGET_SO)