From b5d156317464088ce6e6311d0247f6a74e4b3fd2 Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Sat, 10 Jan 2026 06:52:22 +0400 Subject: [PATCH 1/3] feat: add multi-byte encoders except iso-2022-jp --- fallback/multi-byte.js | 290 +++++++++++++++++++++++++++++++- fallback/multi-byte.table.js | 1 + multi-byte.js | 13 +- multi-byte.node.js | 14 +- tests/multi-byte.encode.test.js | 264 +++++++++++++++++++++++++++++ tests/wpt/loader.cjs | 16 ++ 6 files changed, 593 insertions(+), 5 deletions(-) create mode 100644 tests/multi-byte.encode.test.js diff --git a/fallback/multi-byte.js b/fallback/multi-byte.js index 847d3a1..d829782 100644 --- a/fallback/multi-byte.js +++ b/fallback/multi-byte.js @@ -1,9 +1,9 @@ -import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2 } from './latin1.js' +import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2, encodeAscii } from './latin1.js' import { getTable } from './multi-byte.table.js' export const E_STRICT = 'Input is not well-formed for this encoding' -// TODO: optimize +/* Decoders */ // If the decoder is not cleared properly, state can be preserved between non-streaming calls! // See comment about fatal stream @@ -504,3 +504,289 @@ export function multibyteDecoder(enc, loose = false) { return res + mapper.decode(arr, res.length, arr.length, stream) } } + +/* Encoders */ + +// TODO: optimize, check memory usage? +// TODO: just precalculate all bytes and store offsets in one large u8? + +const e7 = new Map([[148, 236], [149, 237], [150, 243]]) // prettier-ignore +const e8 = new Map([[30, 89], [38, 97], [43, 102], [44, 103], [50, 109], [67, 126], [84, 144], [100, 160]]) // prettier-ignore +const maps = new Map() + +// We accept that encoders use non-trivial amount of mem, for perf +// most are are 128 KiB mem, big5 is 380 KiB, lazy-loaded at first use +function getMap(id, name = id) { + const cached = maps.get(id) + if (cached) return cached + const table = getTable(name) + const map = new Uint16Array(id === 'big5' ? 0x2_f8_a7 : 0xff_e7) // max codepoint in table + 1 + for (let i = 0; i < table.length; i++) { + const c = table[i] + if (c === REP || c === undefined) continue + if (id === 'big5') { + if (i < 5024) continue // this also skips multi-codepoint strings + // In big5, all return first entries except for these + if ( + map[c] && + c !== 0x25_50 && + c !== 0x25_5e && + c !== 0x25_61 && + c !== 0x25_6a && + c !== 0x53_41 && + c !== 0x53_45 + ) { + continue + } + } else { + if (id === 'shift_jis' && i >= 8272 && i <= 8835) continue + if (map[c]) continue + } + + if (typeof c === 'string') { + // always a single codepoint here + map[c.codePointAt(0)] = 1 + i + } else if (c !== REP) { + map[c] = 1 + i + } + } + + if (id === 'shift_jis' || id === 'euc-jp') map[0x22_12] = map[0xff_0d] + maps.set(id, map) + return map +} + +/* eslint-disable @exodus/mutable/no-param-reassign-prop-only */ + +const encoders = { + big5: (err) => { + const map = getMap('big5') + const encode = (u8, i, cp) => { + let p = map[cp] + if (!p) return err(cp) + p-- + const t = p % 157 + u8[i] = 0x81 + ((p / 157) | 0) + u8[i + 1] = (t < 0x3f ? 0x40 : 0x62) + t + return 2 + } + + return { encode, ascii: 0x80 } + }, + 'euc-kr': (err) => { + const map = getMap('euc-kr') + const encode = (u8, i, cp) => { + let p = map[cp] + if (!p) return err(cp) + p-- + u8[i] = 0x81 + ((p / 190) | 0) + u8[i + 1] = (p % 190) + 0x41 + return 2 + } + + return { encode, ascii: 0x80 } + }, + 'euc-jp': (err) => { + const map = getMap('euc-jp', 'jis0208') + const encode = (u8, i, cp) => { + if (cp === 0xa5) { + u8[i] = 0x5c + return 1 + } + + if (cp === 0x20_3e) { + u8[i] = 0x7e + return 1 + } + + if (cp >= 0xff_61 && cp <= 0xff_9f) { + u8[i] = 0x8e + u8[i + 1] = cp - 0xfe_c0 + return 2 + } + + let p = map[cp] + if (!p) return err(cp) + p-- + u8[i] = ((p / 94) | 0) + 0xa1 + u8[i + 1] = (p % 94) + 0xa1 + return 2 + } + + return { encode, ascii: 0x80 } + }, + shift_jis: (err) => { + const map = getMap('shift_jis', 'jis0208') + const encode = (u8, i, cp) => { + if (cp === 0xa5) { + u8[i] = 0x5c + return 1 + } + + if (cp === 0x20_3e) { + u8[i] = 0x7e + return 1 + } + + if (cp >= 0xff_61 && cp <= 0xff_9f) { + u8[i] = cp - 0xfe_c0 + return 1 + } + + let p = map[cp] + if (!p) return err(cp) + p-- + const l = (p / 188) | 0 + const t = p % 188 + u8[i] = (l < 0x1f ? 0x81 : 0xc1) + l + u8[i + 1] = (t < 0x3f ? 0x40 : 0x41) + t + return 2 + } + + return { encode, ascii: 0x81 } + }, + gbk: (err) => { + const map = getMap('gb18030') + + const encode = (u8, i, cp) => { + if (cp === 0xe5_e5) return err(cp) + if (cp === 0x20_ac) { + u8[i] = 0x80 + return 1 + } + + if (cp >= 0xe7_8d && cp <= 0xe8_64) { + if (cp <= 0xe7_93) { + u8[i] = 0xa6 + u8[i + 1] = cp - 0xe6_b4 + return 2 + } + + const l = cp < 0xe8_00 ? 0xa6 : 0xfe + const t = (l === 0xa6 ? e7 : e8).get(cp & 0xff) + if (t) { + u8[i] = l + u8[i + 1] = t + return 2 + } + } + + let p = map[cp] + if (p) { + p-- + const t = p % 190 + u8[i] = 0x81 + ((p / 190) | 0) + u8[i + 1] = (t < 0x3f ? 0x40 : 0x41) + t + return 2 + } + + return err(cp) + } + + return { encode, ascii: 0x80, width: 2 } + }, + gb18030: (err) => { + const map = getMap('gb18030') + const gb18030r = getTable('gb18030-ranges') + + const encode = (u8, i, cp) => { + if (cp === 0xe5_e5) return err(cp) + if (cp >= 0xe7_8d && cp <= 0xe8_64) { + if (cp <= 0xe7_93) { + u8[i] = 0xa6 + u8[i + 1] = cp - 0xe6_b4 + return 2 + } + + const l = cp < 0xe8_00 ? 0xa6 : 0xfe + const t = (l === 0xa6 ? e7 : e8).get(cp & 0xff) + if (t) { + u8[i] = l + u8[i + 1] = t + return 2 + } + } + + let p = map[cp] + if (p) { + p-- + const t = p % 190 + u8[i] = 0x81 + ((p / 190) | 0) + u8[i + 1] = (t < 0x3f ? 0x40 : 0x41) + t + return 2 + } + + let a = 0, b = 0 // prettier-ignore + for (const [c, d] of gb18030r) { + if (d > cp) break + a = c + b = d + } + + let rp = cp === 0xe7_c7 ? 7457 : a + cp - b + u8[i] = 0x81 + ((rp / 12_600) | 0) + rp %= 12_600 + u8[i + 1] = 0x30 + ((rp / 1260) | 0) + rp %= 1260 + u8[i + 2] = 0x81 + ((rp / 10) | 0) + u8[i + 3] = 0x30 + (rp % 10) + return 4 + } + + return { encode, ascii: 0x80, width: 4 } + }, +} + +/* eslint-enable @exodus/mutable/no-param-reassign-prop-only */ + +const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex + +export function multibyteEncoder(enc) { + if (!Object.hasOwn(encoders, enc)) throw new RangeError('Unsupported encoding') + + // eslint-disable-next-line unicorn/consistent-function-scoping + const onErr = (code) => { + throw new TypeError(E_STRICT) + } + + const { encode, ascii, width = 2 } = encoders[enc](onErr) + return (str) => { + if (!NON_LATIN.test(str)) { + try { + return encodeAscii(str, E_STRICT) + } catch {} + } + + const length = str.length + const u8 = new Uint8Array(length * width) + let i = 0 + while (i < length) { + const x0 = str.charCodeAt(i) + if (x0 >= 128) break + u8[i++] = x0 + } + + for (let j = i; j < length; j++) { + const x0 = str.charCodeAt(j) + if (x0 < ascii) { + u8[i++] = x0 + } else if (x0 >= 0xd8_00 && x0 < 0xe0_00) { + if (x0 >= 0xdc_00 || j + 1 === length) { + onErr(x0) // Lone surrogate, TODO: how to handle this in non-strict? + } else { + const x1 = str.charCodeAt(j + 1) + if (x1 < 0xdc_00 || x1 > 0xe0_00) { + onErr(x0) // Lone surrogate, TODO: how to handle this in non-strict? + } else { + j++ // consume x1 + i += encode(u8, i, 0x1_00_00 + ((x1 & 0x3_ff) | ((x0 & 0x3_ff) << 10))) + } + } + } else { + i += encode(u8, i, x0) + } + } + + return i === u8.length ? u8 : u8.subarray(0, i) + } +} diff --git a/fallback/multi-byte.table.js b/fallback/multi-byte.table.js index 1391eb0..967ff68 100644 --- a/fallback/multi-byte.table.js +++ b/fallback/multi-byte.table.js @@ -104,6 +104,7 @@ export function getTable(id) { res = new Array(sizes[id]) // array of strings or undefined unwrap(res, indices[id], 0, true) // Pointer code updates are embedded into the table + // These are skipped in encoder as encoder uses only pointers >= (0xA1 - 0x81) * 157 res[1133] = '\xCA\u0304' res[1135] = '\xCA\u030C' res[1164] = '\xEA\u0304' diff --git a/multi-byte.js b/multi-byte.js index a6ebd19..fc485ec 100644 --- a/multi-byte.js +++ b/multi-byte.js @@ -1,5 +1,6 @@ import { assertUint8 } from './assert.js' -import { multibyteDecoder } from './fallback/multi-byte.js' +import { E_STRING } from './fallback/_utils.js' +import { multibyteDecoder, multibyteEncoder } from './fallback/multi-byte.js' export function createMultibyteDecoder(encoding, loose = false) { const jsDecoder = multibyteDecoder(encoding, loose) // asserts @@ -11,3 +12,13 @@ export function createMultibyteDecoder(encoding, loose = false) { return jsDecoder(arr, stream) } } + +export function createMultibyteEncoder(encoding, { mode = 'fatal' } = {}) { + // TODO: replacement, truncate (replacement will need varying length) + if (mode !== 'fatal') throw new Error('Unsupported mode') + const jsEncoder = multibyteEncoder(encoding) // asserts + return (s) => { + if (typeof s !== 'string') throw new TypeError(E_STRING) + return jsEncoder(s) + } +} diff --git a/multi-byte.node.js b/multi-byte.node.js index 1cdb28d..f6e2bd5 100644 --- a/multi-byte.node.js +++ b/multi-byte.node.js @@ -1,6 +1,6 @@ import { assertUint8 } from './assert.js' -import { isDeno, toBuf } from './fallback/_utils.js' -import { isAsciiSuperset, multibyteDecoder } from './fallback/multi-byte.js' +import { isDeno, toBuf, E_STRING } from './fallback/_utils.js' +import { isAsciiSuperset, multibyteDecoder, multibyteEncoder } from './fallback/multi-byte.js' import { isAscii } from 'node:buffer' export function createMultibyteDecoder(encoding, loose = false) { @@ -21,3 +21,13 @@ export function createMultibyteDecoder(encoding, loose = false) { return jsDecoder(arr, stream) } } + +export function createMultibyteEncoder(encoding, { mode = 'fatal' } = {}) { + // TODO: replacement, truncate (replacement will need varying length) + if (mode !== 'fatal') throw new Error('Unsupported mode') + const jsEncoder = multibyteEncoder(encoding) // asserts + return (s) => { + if (typeof s !== 'string') throw new TypeError(E_STRING) + return jsEncoder(s) + } +} diff --git a/tests/multi-byte.encode.test.js b/tests/multi-byte.encode.test.js new file mode 100644 index 0000000..9b930dc --- /dev/null +++ b/tests/multi-byte.encode.test.js @@ -0,0 +1,264 @@ +import { createMultibyteDecoder, createMultibyteEncoder } from '@exodus/bytes/multi-byte.js' +import { test, describe } from 'node:test' +import { readFileSync } from 'node:fs' +import { join } from 'node:path' + +describe('multi-byte encodings are supersets of ascii', () => { + // Except iso-2022-jp + for (const encoding of ['big5', 'euc-kr', 'euc-jp', 'shift_jis', 'gbk', 'gb18030']) { + test(encoding, (t) => { + const decoder = createMultibyteDecoder(encoding) + const encoder = createMultibyteEncoder(encoding) + for (let i = 0; i < 128; i++) { + let str + try { + str = decoder(Uint8Array.of(i)) + } catch (cause) { + throw new Error(`Error decoding ${i} in ${encoding}`, { cause }) + } + + t.assert.strictEqual(str.length, 1, i) + t.assert.strictEqual(str.codePointAt(0), i, i) + + t.assert.deepStrictEqual(encoder(str), Uint8Array.of(i)) + } + }) + } +}) + +// https://encoding.spec.whatwg.org/#gb18030-encoder step 5 +const gbExceptions = { + E78D: Uint8Array.of(0xa6, 0xd9), + E78E: Uint8Array.of(0xa6, 0xda), + E78F: Uint8Array.of(0xa6, 0xdb), + E790: Uint8Array.of(0xa6, 0xdc), + E791: Uint8Array.of(0xa6, 0xdd), + E792: Uint8Array.of(0xa6, 0xde), + E793: Uint8Array.of(0xa6, 0xdf), + E794: Uint8Array.of(0xa6, 0xec), + E795: Uint8Array.of(0xa6, 0xed), + E796: Uint8Array.of(0xa6, 0xf3), + E81E: Uint8Array.of(0xfe, 0x59), + E826: Uint8Array.of(0xfe, 0x61), + E82B: Uint8Array.of(0xfe, 0x66), + E82C: Uint8Array.of(0xfe, 0x67), + E832: Uint8Array.of(0xfe, 0x6d), + E843: Uint8Array.of(0xfe, 0x7e), + E854: Uint8Array.of(0xfe, 0x90), + E864: Uint8Array.of(0xfe, 0xa0), +} + +describe('specific tests', () => { + test('big5', (t) => { + const enc = createMultibyteEncoder('big5') + const dec = createMultibyteDecoder('big5') + + // https://encoding.spec.whatwg.org/#index-big5-pointer + // If codePoint is U+2550 (═), U+255E (╞), U+2561 (╡), U+256A (╪), U+5341 (十), or U+5345 (卅), + // then return the last pointer corresponding to codePoint in index. + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xa4)), String.fromCodePoint(0x25_50)) // 5247 + t.assert.strictEqual(dec(Uint8Array.of(0xf9, 0xf9)), String.fromCodePoint(0x25_50)) // 18991 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x25_50)), Uint8Array.of(0xf9, 0xf9)) // 18991 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xa5)), String.fromCodePoint(0x25_5e)) // 5248 + t.assert.strictEqual(dec(Uint8Array.of(0xf9, 0xe9)), String.fromCodePoint(0x25_5e)) // 18975 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x25_5e)), Uint8Array.of(0xf9, 0xe9)) // 18975 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xa7)), String.fromCodePoint(0x25_61)) // 5250 + t.assert.strictEqual(dec(Uint8Array.of(0xf9, 0xeb)), String.fromCodePoint(0x25_61)) // 18977 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x25_61)), Uint8Array.of(0xf9, 0xeb)) // 18977 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xa6)), String.fromCodePoint(0x25_6a)) // 5249 + t.assert.strictEqual(dec(Uint8Array.of(0xf9, 0xea)), String.fromCodePoint(0x25_6a)) // 18976 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x25_6a)), Uint8Array.of(0xf9, 0xea)) // 18976 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xcc)), String.fromCodePoint(0x53_41)) // 5287 + t.assert.strictEqual(dec(Uint8Array.of(0xa4, 0x51)), String.fromCodePoint(0x53_41)) // 5512 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x53_41)), Uint8Array.of(0xa4, 0x51)) // 5512 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xce)), String.fromCodePoint(0x53_45)) // 5289 + t.assert.strictEqual(dec(Uint8Array.of(0xa4, 0xca)), String.fromCodePoint(0x53_45)) // 5599 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x53_45)), Uint8Array.of(0xa4, 0xca)) // 5599 + + // But not others, which return first codepoint in index + t.assert.strictEqual(dec(Uint8Array.of(0xa1, 0xb2)), String.fromCodePoint(0x30_03)) // 5104 + t.assert.strictEqual(dec(Uint8Array.of(0xc6, 0xde)), String.fromCodePoint(0x30_03)) // 10957 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x30_03)), Uint8Array.of(0xa1, 0xb2)) // 5104 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xcd)), String.fromCodePoint(0x53_44)) // 5288 + t.assert.strictEqual(dec(Uint8Array.of(0xfa, 0xc5)), String.fromCodePoint(0x53_44)) // 19096 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x53_44)), Uint8Array.of(0xa2, 0xcd)) // 5288 + }) + + test('shift_jis', (t) => { + const enc = createMultibyteEncoder('shift_jis') + const dec = createMultibyteDecoder('shift_jis') + + // https://encoding.spec.whatwg.org/#shift_jis-encoder + t.assert.deepStrictEqual(enc('\u007F'), Uint8Array.of(0x7f)) + t.assert.deepStrictEqual(enc('\u0080'), Uint8Array.of(0x80)) // If codePoint is an ASCII code point or U+0080, then return a byte whose value is codePoint. + t.assert.deepStrictEqual(enc('\u00A5'), Uint8Array.of(0x5c)) // If codePoint is U+00A5 (¥), then return byte 0x5C. + t.assert.deepStrictEqual(enc('\u203E'), Uint8Array.of(0x7e)) // If codePoint is U+203E (‾), then return byte 0x7E. + t.assert.deepStrictEqual(enc('\u2212'), enc('\uFF0D')) // If codePoint is U+2212 (−), then set it to U+FF0D (-). + t.assert.strictEqual(dec(enc('\uFF0D')), '\uFF0D') + t.assert.strictEqual(dec(enc('\u2212')), '\uFF0D') + + for (let i = 0xff_61; i <= 0xff_9f; i++) { + const str = String.fromCodePoint(i) + t.assert.deepStrictEqual(enc(str), Uint8Array.of(i - 0xff_61 + 0xa1)) + t.assert.strictEqual(dec(enc(str)), str) + } + }) + + test('euc-jp', (t) => { + const enc = createMultibyteEncoder('euc-jp') + const dec = createMultibyteDecoder('euc-jp') + + // https://encoding.spec.whatwg.org/#euc-jp-encoder + t.assert.deepStrictEqual(enc('\u007F'), Uint8Array.of(0x7f)) + t.assert.throws(() => enc('\u0080')) + t.assert.deepStrictEqual(enc('\u00A5'), Uint8Array.of(0x5c)) // If codePoint is U+00A5 (¥), then return byte 0x5C. + t.assert.deepStrictEqual(enc('\u203E'), Uint8Array.of(0x7e)) // If codePoint is U+203E (‾), then return byte 0x7E. + t.assert.deepStrictEqual(enc('\u2212'), enc('\uFF0D')) // If codePoint is U+2212 (−), then set it to U+FF0D (-). + t.assert.strictEqual(dec(enc('\uFF0D')), '\uFF0D') + t.assert.strictEqual(dec(enc('\u2212')), '\uFF0D') + for (let i = 0xff_61; i <= 0xff_9f; i++) { + const str = String.fromCodePoint(i) + t.assert.deepStrictEqual(enc(str), Uint8Array.of(0x8e, i - 0xff_61 + 0xa1)) + t.assert.strictEqual(dec(enc(str)), str) + } + }) + + test('euc-kr', (t) => { + const enc = createMultibyteEncoder('euc-kr') + + // https://encoding.spec.whatwg.org/#euc-kr-encoder + t.assert.deepStrictEqual(enc('\u007F'), Uint8Array.of(0x7f)) + t.assert.throws(() => enc('\u0080')) + }) + + test('gb18030, gbk', (t) => { + // gb18030 can encode replacement + t.assert.throws(() => createMultibyteEncoder('gbk')('\uFFFD')) // gbk can't encode it + const rep = createMultibyteEncoder('gb18030')('\uFFFD') + t.assert.strictEqual(createMultibyteDecoder('gb18030')(rep), '\uFFFD') + t.assert.deepStrictEqual(rep, Uint8Array.of(0x84, 0x31, 0xa4, 0x37)) // pointer 39417, valid representation for the replacement char + + // https://encoding.spec.whatwg.org/#gb18030-encoder + // 3. If codePoint is U+E5E5, then return error with codePoint. + t.assert.throws(() => createMultibyteEncoder('gbk')('\uE5E5')) // not present in index so doesn't need special handling + t.assert.throws(() => createMultibyteEncoder('gb18030')('\uE5E5')) // excluded from ranges via a specific check + + // gbk and gb18030 encode U+20AC differently, but decode both variants + // https://encoding.spec.whatwg.org/#gb18030-encoder + // 4. If is GBK is true and codePoint is U+20AC (€), then return byte 0x80. + t.assert.deepStrictEqual(createMultibyteEncoder('gb18030')('\u20AC'), Uint8Array.of(0xa2, 0xe3)) + t.assert.deepStrictEqual(createMultibyteEncoder('gbk')('\u20AC'), Uint8Array.of(0x80)) + t.assert.strictEqual(createMultibyteDecoder('gb18030')(Uint8Array.of(0xa2, 0xe3)), '\u20AC') + t.assert.strictEqual(createMultibyteDecoder('gb18030')(Uint8Array.of(0x80)), '\u20AC') + t.assert.strictEqual(createMultibyteDecoder('gbk')(Uint8Array.of(0xa2, 0xe3)), '\u20AC') + t.assert.strictEqual(createMultibyteDecoder('gbk')(Uint8Array.of(0x80)), '\u20AC') + + for (const encoding of ['gb18030', 'gbk']) { + const enc = createMultibyteEncoder(encoding) + for (const [hex, u8] of Object.entries(gbExceptions)) { + t.assert.doesNotThrow( + () => t.assert.deepStrictEqual(enc(String.fromCodePoint(parseInt(hex, 16))), u8), + `${encoding}(U+${hex})` + ) + } + } + }) +}) + +function loadTable(encoding, t) { + const text = readFileSync( + join(import.meta.dirname, 'encoding/fixtures/multi-byte', `index-${encoding}.txt`), + 'utf8' + ) + + const rows = text + .split('\n') + .map((x) => x.trim()) + .filter((x) => x && x[0] !== '#') + .map((x) => x.split('\t')) + .map(([istr, codeHex, description]) => { + const i = Number(istr) + const code = parseInt(codeHex.slice(2), 16) + t.assert.strictEqual(`${i}`, istr) + t.assert.strictEqual('0x' + code.toString(16).padStart(4, '0').toUpperCase(), codeHex) + return { i, code, description } + }) + + t.assert.strictEqual(rows.length, new Set(rows.map((row) => row.i)).size) // all unique + return rows +} + +describe('roundtrip, tables', () => { + const encodings = { + big5: 'big5', + shift_jis: 'jis0208', + 'euc-jp': 'jis0208', + 'euc-kr': 'euc-kr', + gbk: 'gb18030', + gb18030: 'gb18030', + } + + for (const [encoding, tableID] of Object.entries(encodings)) { + test(encoding, (t) => { + const enc = createMultibyteEncoder(encoding) + const dec = createMultibyteDecoder(encoding) + const table = loadTable(tableID, t) + const last = new Map(table.map(({ i, code }) => [code, i])) + for (const { i, code, description } of table) { + const str = String.fromCodePoint(code) + + // https://encoding.spec.whatwg.org/#index-big5-pointer excludes low pointers + if (encoding === 'big5' && i < (0xa1 - 0x81) * 157) { + // If last seen with that code is in low pointer range, it should throw + if (last.get(code) === i) t.assert.throws(() => enc(str), description) + continue + } + + t.assert.doesNotThrow(() => t.assert.strictEqual(dec(enc(str)), str), description) + } + }) + } +}) + +describe('roundtrip, full Unicode', () => { + const MAX = 0x10_ff_ff // Max Unicode codepoint + + test('gb18030', { timeout: 60_000 }, (t) => { + const enc = createMultibyteEncoder('gb18030') + const dec = createMultibyteDecoder('gb18030') + + for (let i = 0; i <= MAX; i++) { + const s = String.fromCodePoint(i) + const id = `U+${i.toString(16).toUpperCase()}` + if (i >= 0xd8_00 && i <= 0xdf_ff) { + // Surrogates + t.assert.throws(() => enc(s), `Surrogate ${id}`) + continue + } + + // https://encoding.spec.whatwg.org/#gb18030-encoder step 3. If codePoint is U+E5E5, then return error with codePoint. + if (i === 0xe5_e5) { + t.assert.throws(() => enc(s), id) + continue + } + + let u8 + t.assert.doesNotThrow(() => { + u8 = enc(s) + }, id) + + if (Object.hasOwn(gbExceptions, i.toString(16).toUpperCase())) { + t.assert.deepStrictEqual(u8, gbExceptions[i.toString(16).toUpperCase()], id) + } else { + t.assert.strictEqual(dec(u8), s, id) + } + } + }) +}) diff --git a/tests/wpt/loader.cjs b/tests/wpt/loader.cjs index 36d94e0..e8271ba 100644 --- a/tests/wpt/loader.cjs +++ b/tests/wpt/loader.cjs @@ -2,6 +2,7 @@ const assert = require('node:assert/strict') const fs = require('node:fs') const path = require('node:path') const { describe, test } = require('node:test') +const { createMultibyteEncoder } = require('@exodus/bytes/multi-byte.js') // TextDecoderStream / TextEncoderStream implementations expect Streams to be present if (!globalThis.ReadableStream) { @@ -146,6 +147,8 @@ function loadTextDecoderHtml(fullName) { assert.ok(encoding && encoding.length > 0) const decoder = new globalThis.TextDecoder(encoding) const fatal = new globalThis.TextDecoder(encoding, { fatal: true }) + const encode = + decoder.encoding === 'iso-2022-jp' ? null : createMultibyteEncoder(decoder.encoding) // TODO: iso-2022-jp if (fullName.endsWith('_errors.html')) { const sep0 = '' @@ -216,6 +219,19 @@ function loadTextDecoderHtml(fullName) { t.assert.strictEqual(fatal.decode(bytes), expected, `${bytesHex} => U+${cpHex}`) } + // Test encoder + // This is limited, encoders are asymmetrical + if ( + !(decoder.encoding === 'euc-jp' && bytes.length === 3) && // no jis0212 encoding in spec + !(decoder.encoding === 'big5' && bytes[0] > 0x7f && bytes[0] <= 0xa0) && // encoding excludes pointers less than (0xA1 - 0x81) × 157. + decoder.encoding !== 'iso-2022-jp' // Not implemented yet + ) { + t.assert.doesNotThrow( + () => t.assert.deepEqual(encode(String.fromCodePoint(cp)), bytes), + `encode U+${cpHex} => ${bytesHex}` + ) + } + tested++ } From 79cf7bbaf0eac8fe6ad888d2390cb99cae2a2074 Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Sun, 11 Jan 2026 10:59:02 +0400 Subject: [PATCH 2/3] perf: improve multi-byte encoders perf --- fallback/multi-byte.js | 368 +++++++++++++++++------------------------ multi-byte.js | 7 +- multi-byte.node.js | 8 +- 3 files changed, 151 insertions(+), 232 deletions(-) diff --git a/fallback/multi-byte.js b/fallback/multi-byte.js index d829782..e01bc11 100644 --- a/fallback/multi-byte.js +++ b/fallback/multi-byte.js @@ -1,3 +1,4 @@ +import { E_STRING } from './_utils.js' import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2, encodeAscii } from './latin1.js' import { getTable } from './multi-byte.table.js' @@ -507,20 +508,36 @@ export function multibyteDecoder(enc, loose = false) { /* Encoders */ -// TODO: optimize, check memory usage? -// TODO: just precalculate all bytes and store offsets in one large u8? - -const e7 = new Map([[148, 236], [149, 237], [150, 243]]) // prettier-ignore -const e8 = new Map([[30, 89], [38, 97], [43, 102], [44, 103], [50, 109], [67, 126], [84, 144], [100, 160]]) // prettier-ignore const maps = new Map() +const e7 = [[148, 236], [149, 237], [150, 243]] // prettier-ignore +const e8 = [[30, 89], [38, 97], [43, 102], [44, 103], [50, 109], [67, 126], [84, 144], [100, 160]] // prettier-ignore +const preencoders = { + __proto__: null, + big5: (p) => ((((p / 157) | 0) + 0x81) << 8) | ((p % 157 < 0x3f ? 0x40 : 0x62) + (p % 157)), + shift_jis: (p) => { + const l = (p / 188) | 0 + const t = p % 188 + return ((l + (l < 0x1f ? 0x81 : 0xc1)) << 8) | ((t < 0x3f ? 0x40 : 0x41) + t) + }, + 'euc-jp': (p) => ((((p / 94) | 0) + 0xa1) << 8) | ((p % 94) + 0xa1), + 'euc-kr': (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190) + 0x41), + gb18030: (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190 < 0x3f ? 0x40 : 0x41) + (p % 190)), +} + +preencoders.gbk = preencoders.gb18030 // We accept that encoders use non-trivial amount of mem, for perf // most are are 128 KiB mem, big5 is 380 KiB, lazy-loaded at first use -function getMap(id, name = id) { +function getMap(id, size) { const cached = maps.get(id) if (cached) return cached - const table = getTable(name) - const map = new Uint16Array(id === 'big5' ? 0x2_f8_a7 : 0xff_e7) // max codepoint in table + 1 + let tname = id + const sjis = id === 'shift_jis' + if (id === 'gbk') tname = 'gb18030' + if (id === 'euc-jp' || sjis) tname = 'jis0208' + const table = getTable(tname) + const map = new Uint16Array(size) + const enc = preencoders[id] || ((p) => p + 1) for (let i = 0; i < table.length; i++) { const c = table[i] if (c === REP || c === undefined) continue @@ -539,218 +556,50 @@ function getMap(id, name = id) { continue } } else { - if (id === 'shift_jis' && i >= 8272 && i <= 8835) continue + if (sjis && i >= 8272 && i <= 8835) continue if (map[c]) continue } if (typeof c === 'string') { // always a single codepoint here - map[c.codePointAt(0)] = 1 + i + map[c.codePointAt(0)] = enc(i) } else if (c !== REP) { - map[c] = 1 + i + map[c] = enc(i) } } - if (id === 'shift_jis' || id === 'euc-jp') map[0x22_12] = map[0xff_0d] + for (let i = 0; i < 0x80; i++) map[i] = i + if (sjis || id === 'euc-jp') { + if (sjis) map[0x80] = 0x80 + const d = sjis ? 0xfe_c0 : 0x70_c0 + for (let i = 0xff_61; i <= 0xff_9f; i++) map[i] = i - d + map[0x22_12] = map[0xff_0d] + map[0xa5] = 0x5c + map[0x20_3e] = 0x7e + } else if (tname === 'gb18030') { + if (id === 'gbk') map[0x20_ac] = 0x80 + for (let i = 0xe7_8d; i <= 0xe7_93; i++) map[i] = i - 0x40_b4 + for (const [a, b] of e7) map[0xe7_00 | a] = 0xa6_00 | b + for (const [a, b] of e8) map[0xe8_00 | a] = 0xfe_00 | b + } + maps.set(id, map) return map } -/* eslint-disable @exodus/mutable/no-param-reassign-prop-only */ - -const encoders = { - big5: (err) => { - const map = getMap('big5') - const encode = (u8, i, cp) => { - let p = map[cp] - if (!p) return err(cp) - p-- - const t = p % 157 - u8[i] = 0x81 + ((p / 157) | 0) - u8[i + 1] = (t < 0x3f ? 0x40 : 0x62) + t - return 2 - } - - return { encode, ascii: 0x80 } - }, - 'euc-kr': (err) => { - const map = getMap('euc-kr') - const encode = (u8, i, cp) => { - let p = map[cp] - if (!p) return err(cp) - p-- - u8[i] = 0x81 + ((p / 190) | 0) - u8[i + 1] = (p % 190) + 0x41 - return 2 - } - - return { encode, ascii: 0x80 } - }, - 'euc-jp': (err) => { - const map = getMap('euc-jp', 'jis0208') - const encode = (u8, i, cp) => { - if (cp === 0xa5) { - u8[i] = 0x5c - return 1 - } - - if (cp === 0x20_3e) { - u8[i] = 0x7e - return 1 - } - - if (cp >= 0xff_61 && cp <= 0xff_9f) { - u8[i] = 0x8e - u8[i + 1] = cp - 0xfe_c0 - return 2 - } - - let p = map[cp] - if (!p) return err(cp) - p-- - u8[i] = ((p / 94) | 0) + 0xa1 - u8[i + 1] = (p % 94) + 0xa1 - return 2 - } - - return { encode, ascii: 0x80 } - }, - shift_jis: (err) => { - const map = getMap('shift_jis', 'jis0208') - const encode = (u8, i, cp) => { - if (cp === 0xa5) { - u8[i] = 0x5c - return 1 - } - - if (cp === 0x20_3e) { - u8[i] = 0x7e - return 1 - } - - if (cp >= 0xff_61 && cp <= 0xff_9f) { - u8[i] = cp - 0xfe_c0 - return 1 - } - - let p = map[cp] - if (!p) return err(cp) - p-- - const l = (p / 188) | 0 - const t = p % 188 - u8[i] = (l < 0x1f ? 0x81 : 0xc1) + l - u8[i + 1] = (t < 0x3f ? 0x40 : 0x41) + t - return 2 - } - - return { encode, ascii: 0x81 } - }, - gbk: (err) => { - const map = getMap('gb18030') - - const encode = (u8, i, cp) => { - if (cp === 0xe5_e5) return err(cp) - if (cp === 0x20_ac) { - u8[i] = 0x80 - return 1 - } - - if (cp >= 0xe7_8d && cp <= 0xe8_64) { - if (cp <= 0xe7_93) { - u8[i] = 0xa6 - u8[i + 1] = cp - 0xe6_b4 - return 2 - } - - const l = cp < 0xe8_00 ? 0xa6 : 0xfe - const t = (l === 0xa6 ? e7 : e8).get(cp & 0xff) - if (t) { - u8[i] = l - u8[i + 1] = t - return 2 - } - } - - let p = map[cp] - if (p) { - p-- - const t = p % 190 - u8[i] = 0x81 + ((p / 190) | 0) - u8[i + 1] = (t < 0x3f ? 0x40 : 0x41) + t - return 2 - } - - return err(cp) - } - - return { encode, ascii: 0x80, width: 2 } - }, - gb18030: (err) => { - const map = getMap('gb18030') - const gb18030r = getTable('gb18030-ranges') - - const encode = (u8, i, cp) => { - if (cp === 0xe5_e5) return err(cp) - if (cp >= 0xe7_8d && cp <= 0xe8_64) { - if (cp <= 0xe7_93) { - u8[i] = 0xa6 - u8[i + 1] = cp - 0xe6_b4 - return 2 - } - - const l = cp < 0xe8_00 ? 0xa6 : 0xfe - const t = (l === 0xa6 ? e7 : e8).get(cp & 0xff) - if (t) { - u8[i] = l - u8[i + 1] = t - return 2 - } - } - - let p = map[cp] - if (p) { - p-- - const t = p % 190 - u8[i] = 0x81 + ((p / 190) | 0) - u8[i + 1] = (t < 0x3f ? 0x40 : 0x41) + t - return 2 - } - - let a = 0, b = 0 // prettier-ignore - for (const [c, d] of gb18030r) { - if (d > cp) break - a = c - b = d - } - - let rp = cp === 0xe7_c7 ? 7457 : a + cp - b - u8[i] = 0x81 + ((rp / 12_600) | 0) - rp %= 12_600 - u8[i + 1] = 0x30 + ((rp / 1260) | 0) - rp %= 1260 - u8[i + 2] = 0x81 + ((rp / 10) | 0) - u8[i + 3] = 0x30 + (rp % 10) - return 4 - } - - return { encode, ascii: 0x80, width: 4 } - }, -} - -/* eslint-enable @exodus/mutable/no-param-reassign-prop-only */ - +const encoders = new Set(['big5', 'euc-kr', 'euc-jp', 'shift_jis', 'gbk', 'gb18030']) const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex +let gb18030r -export function multibyteEncoder(enc) { - if (!Object.hasOwn(encoders, enc)) throw new RangeError('Unsupported encoding') - - // eslint-disable-next-line unicorn/consistent-function-scoping - const onErr = (code) => { - throw new TypeError(E_STRICT) - } +export function multibyteEncoder(enc, onError) { + if (!encoders.has(enc)) throw new RangeError('Unsupported encoding') + const size = enc === 'big5' ? 0x2_f8_a7 : 0x1_00_00 // for big5, max codepoint in table + 1 + const width = enc === 'gb18030' ? 4 : 2 + const map = getMap(enc, size) + if (enc === 'gb18030' && !gb18030r) gb18030r = getTable('gb18030-ranges') - const { encode, ascii, width = 2 } = encoders[enc](onErr) return (str) => { + if (typeof str !== 'string') throw new TypeError(E_STRING) if (!NON_LATIN.test(str)) { try { return encodeAscii(str, E_STRICT) @@ -761,29 +610,108 @@ export function multibyteEncoder(enc) { const u8 = new Uint8Array(length * width) let i = 0 while (i < length) { - const x0 = str.charCodeAt(i) - if (x0 >= 128) break - u8[i++] = x0 + const x = str.charCodeAt(i) + if (x >= 128) break + u8[i++] = x + } + + // eslint-disable-next-line unicorn/consistent-function-scoping + const err = (code) => { + if (onError) return onError(code, u8, i) + throw new TypeError(E_STRICT) } - for (let j = i; j < length; j++) { - const x0 = str.charCodeAt(j) - if (x0 < ascii) { - u8[i++] = x0 - } else if (x0 >= 0xd8_00 && x0 < 0xe0_00) { - if (x0 >= 0xdc_00 || j + 1 === length) { - onErr(x0) // Lone surrogate, TODO: how to handle this in non-strict? + if (!map || map.length < size) throw new Error('Unreachable') // Important for perf + if (enc === 'gb18030') { + // Deduping this branch hurts other encoders perf + const encode = (cp) => { + let a = 0, b = 0 // prettier-ignore + for (const [c, d] of gb18030r) { + if (d > cp) break + a = c + b = d + } + + let rp = cp === 0xe7_c7 ? 7457 : a + cp - b + u8[i++] = 0x81 + ((rp / 12_600) | 0) + rp %= 12_600 + u8[i++] = 0x30 + ((rp / 1260) | 0) + rp %= 1260 + u8[i++] = 0x81 + ((rp / 10) | 0) + u8[i++] = 0x30 + (rp % 10) + } + + for (let j = i; j < length; j++) { + const x = str.charCodeAt(j) + if (x >= 0xd8_00 && x < 0xe0_00) { + if (x >= 0xdc_00 || j + 1 === length) { + i += err(x) // lone + } else { + const x1 = str.charCodeAt(j + 1) + if (x1 < 0xdc_00 || x1 >= 0xe0_00) { + i += err(x) // lone + } else { + j++ // consume x1 + encode(0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10))) + } + } } else { - const x1 = str.charCodeAt(j + 1) - if (x1 < 0xdc_00 || x1 > 0xe0_00) { - onErr(x0) // Lone surrogate, TODO: how to handle this in non-strict? + const e = map[x] + if (e & 0xff_00) { + u8[i++] = e >> 8 + u8[i++] = e & 0xff + } else if (e || x === 0) { + u8[i++] = e + } else if (x === 0xe5_e5) { + i += err(x) } else { - j++ // consume x1 - i += encode(u8, i, 0x1_00_00 + ((x1 & 0x3_ff) | ((x0 & 0x3_ff) << 10))) + encode(x) + } + } + } + } else { + const long = + enc === 'big5' + ? (x) => { + const e = map[x] + if (e & 0xff_00) { + u8[i++] = e >> 8 + u8[i++] = e & 0xff + } else if (e || x === 0) { + u8[i++] = e + } else { + i += err(x) + } + } + : (x) => { + i += err(x) + } + + for (let j = i; j < length; j++) { + const x = str.charCodeAt(j) + if (x >= 0xd8_00 && x < 0xe0_00) { + if (x >= 0xdc_00 || j + 1 === length) { + i += err(x) // lone + } else { + const x1 = str.charCodeAt(j + 1) + if (x1 < 0xdc_00 || x1 >= 0xe0_00) { + i += err(x) // lone + } else { + j++ // consume x1 + long(0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10))) + } + } + } else { + const e = map[x] + if (e & 0xff_00) { + u8[i++] = e >> 8 + u8[i++] = e & 0xff + } else if (e || x === 0) { + u8[i++] = e + } else { + i += err(x) } } - } else { - i += encode(u8, i, x0) } } diff --git a/multi-byte.js b/multi-byte.js index fc485ec..c7837da 100644 --- a/multi-byte.js +++ b/multi-byte.js @@ -1,5 +1,4 @@ import { assertUint8 } from './assert.js' -import { E_STRING } from './fallback/_utils.js' import { multibyteDecoder, multibyteEncoder } from './fallback/multi-byte.js' export function createMultibyteDecoder(encoding, loose = false) { @@ -16,9 +15,5 @@ export function createMultibyteDecoder(encoding, loose = false) { export function createMultibyteEncoder(encoding, { mode = 'fatal' } = {}) { // TODO: replacement, truncate (replacement will need varying length) if (mode !== 'fatal') throw new Error('Unsupported mode') - const jsEncoder = multibyteEncoder(encoding) // asserts - return (s) => { - if (typeof s !== 'string') throw new TypeError(E_STRING) - return jsEncoder(s) - } + return multibyteEncoder(encoding) // asserts } diff --git a/multi-byte.node.js b/multi-byte.node.js index f6e2bd5..c8a6e1b 100644 --- a/multi-byte.node.js +++ b/multi-byte.node.js @@ -1,5 +1,5 @@ import { assertUint8 } from './assert.js' -import { isDeno, toBuf, E_STRING } from './fallback/_utils.js' +import { isDeno, toBuf } from './fallback/_utils.js' import { isAsciiSuperset, multibyteDecoder, multibyteEncoder } from './fallback/multi-byte.js' import { isAscii } from 'node:buffer' @@ -25,9 +25,5 @@ export function createMultibyteDecoder(encoding, loose = false) { export function createMultibyteEncoder(encoding, { mode = 'fatal' } = {}) { // TODO: replacement, truncate (replacement will need varying length) if (mode !== 'fatal') throw new Error('Unsupported mode') - const jsEncoder = multibyteEncoder(encoding) // asserts - return (s) => { - if (typeof s !== 'string') throw new TypeError(E_STRING) - return jsEncoder(s) - } + return multibyteEncoder(encoding) // asserts } From 3e803c1c92f24db53f26739d215ce49dd0043f8f Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Sun, 11 Jan 2026 13:31:02 +0400 Subject: [PATCH 3/3] test: limit round-trip tests on slow engines --- tests/multi-byte.encode.test.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/multi-byte.encode.test.js b/tests/multi-byte.encode.test.js index 9b930dc..ea9627f 100644 --- a/tests/multi-byte.encode.test.js +++ b/tests/multi-byte.encode.test.js @@ -227,8 +227,12 @@ describe('roundtrip, tables', () => { } }) +const slowEngine = + process.env.EXODUS_TEST_PLATFORM === 'quickjs' || + process.env.EXODUS_TEST_PLATFORM === 'xs' || + process.env.EXODUS_TEST_PLATFORM === 'engine262' describe('roundtrip, full Unicode', () => { - const MAX = 0x10_ff_ff // Max Unicode codepoint + const MAX = slowEngine ? 0x1_ff_ff : 0x10_ff_ff // Max Unicode codepoint test('gb18030', { timeout: 60_000 }, (t) => { const enc = createMultibyteEncoder('gb18030')