diff --git a/fallback/multi-byte.js b/fallback/multi-byte.js index 847d3a1..e01bc11 100644 --- a/fallback/multi-byte.js +++ b/fallback/multi-byte.js @@ -1,9 +1,10 @@ -import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2 } from './latin1.js' +import { E_STRING } from './_utils.js' +import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2, encodeAscii } from './latin1.js' import { getTable } from './multi-byte.table.js' export const E_STRICT = 'Input is not well-formed for this encoding' -// TODO: optimize +/* Decoders */ // If the decoder is not cleared properly, state can be preserved between non-streaming calls! // See comment about fatal stream @@ -504,3 +505,216 @@ export function multibyteDecoder(enc, loose = false) { return res + mapper.decode(arr, res.length, arr.length, stream) } } + +/* Encoders */ + +const maps = new Map() +const e7 = [[148, 236], [149, 237], [150, 243]] // prettier-ignore +const e8 = [[30, 89], [38, 97], [43, 102], [44, 103], [50, 109], [67, 126], [84, 144], [100, 160]] // prettier-ignore +const preencoders = { + __proto__: null, + big5: (p) => ((((p / 157) | 0) + 0x81) << 8) | ((p % 157 < 0x3f ? 0x40 : 0x62) + (p % 157)), + shift_jis: (p) => { + const l = (p / 188) | 0 + const t = p % 188 + return ((l + (l < 0x1f ? 0x81 : 0xc1)) << 8) | ((t < 0x3f ? 0x40 : 0x41) + t) + }, + 'euc-jp': (p) => ((((p / 94) | 0) + 0xa1) << 8) | ((p % 94) + 0xa1), + 'euc-kr': (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190) + 0x41), + gb18030: (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190 < 0x3f ? 
0x40 : 0x41) + (p % 190)), +} + +preencoders.gbk = preencoders.gb18030 + +// We accept that encoders use non-trivial amount of mem, for perf +// most are 128 KiB mem, big5 is 380 KiB, lazy-loaded at first use +function getMap(id, size) { + const cached = maps.get(id) + if (cached) return cached + let tname = id + const sjis = id === 'shift_jis' + if (id === 'gbk') tname = 'gb18030' + if (id === 'euc-jp' || sjis) tname = 'jis0208' + const table = getTable(tname) + const map = new Uint16Array(size) + const enc = preencoders[id] || ((p) => p + 1) + for (let i = 0; i < table.length; i++) { + const c = table[i] + if (c === REP || c === undefined) continue + if (id === 'big5') { + if (i < 5024) continue // this also skips multi-codepoint strings + // In big5, all return first entries except for these + if ( + map[c] && + c !== 0x25_50 && + c !== 0x25_5e && + c !== 0x25_61 && + c !== 0x25_6a && + c !== 0x53_41 && + c !== 0x53_45 + ) { + continue + } + } else { + if (sjis && i >= 8272 && i <= 8835) continue + if (map[c]) continue + } + + if (typeof c === 'string') { + // always a single codepoint here + map[c.codePointAt(0)] = enc(i) + } else if (c !== REP) { + map[c] = enc(i) + } + } + + for (let i = 0; i < 0x80; i++) map[i] = i + if (sjis || id === 'euc-jp') { + if (sjis) map[0x80] = 0x80 + const d = sjis ? 
0xfe_c0 : 0x70_c0 + for (let i = 0xff_61; i <= 0xff_9f; i++) map[i] = i - d + map[0x22_12] = map[0xff_0d] + map[0xa5] = 0x5c + map[0x20_3e] = 0x7e + } else if (tname === 'gb18030') { + if (id === 'gbk') map[0x20_ac] = 0x80 + for (let i = 0xe7_8d; i <= 0xe7_93; i++) map[i] = i - 0x40_b4 + for (const [a, b] of e7) map[0xe7_00 | a] = 0xa6_00 | b + for (const [a, b] of e8) map[0xe8_00 | a] = 0xfe_00 | b + } + + maps.set(id, map) + return map +} + +const encoders = new Set(['big5', 'euc-kr', 'euc-jp', 'shift_jis', 'gbk', 'gb18030']) +const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex +let gb18030r + +export function multibyteEncoder(enc, onError) { + if (!encoders.has(enc)) throw new RangeError('Unsupported encoding') + const size = enc === 'big5' ? 0x2_f8_a7 : 0x1_00_00 // for big5, max codepoint in table + 1 + const width = enc === 'gb18030' ? 4 : 2 + const map = getMap(enc, size) + if (enc === 'gb18030' && !gb18030r) gb18030r = getTable('gb18030-ranges') + + return (str) => { + if (typeof str !== 'string') throw new TypeError(E_STRING) + if (!NON_LATIN.test(str)) { + try { + return encodeAscii(str, E_STRICT) + } catch {} + } + + const length = str.length + const u8 = new Uint8Array(length * width) + let i = 0 + while (i < length) { + const x = str.charCodeAt(i) + if (x >= 128) break + u8[i++] = x + } + + // eslint-disable-next-line unicorn/consistent-function-scoping + const err = (code) => { + if (onError) return onError(code, u8, i) + throw new TypeError(E_STRICT) + } + + if (!map || map.length < size) throw new Error('Unreachable') // Important for perf + if (enc === 'gb18030') { + // Deduping this branch hurts other encoders perf + const encode = (cp) => { + let a = 0, b = 0 // prettier-ignore + for (const [c, d] of gb18030r) { + if (d > cp) break + a = c + b = d + } + + let rp = cp === 0xe7_c7 ? 
7457 : a + cp - b + u8[i++] = 0x81 + ((rp / 12_600) | 0) + rp %= 12_600 + u8[i++] = 0x30 + ((rp / 1260) | 0) + rp %= 1260 + u8[i++] = 0x81 + ((rp / 10) | 0) + u8[i++] = 0x30 + (rp % 10) + } + + for (let j = i; j < length; j++) { + const x = str.charCodeAt(j) + if (x >= 0xd8_00 && x < 0xe0_00) { + if (x >= 0xdc_00 || j + 1 === length) { + i += err(x) // lone + } else { + const x1 = str.charCodeAt(j + 1) + if (x1 < 0xdc_00 || x1 >= 0xe0_00) { + i += err(x) // lone + } else { + j++ // consume x1 + encode(0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10))) + } + } + } else { + const e = map[x] + if (e & 0xff_00) { + u8[i++] = e >> 8 + u8[i++] = e & 0xff + } else if (e || x === 0) { + u8[i++] = e + } else if (x === 0xe5_e5) { + i += err(x) + } else { + encode(x) + } + } + } + } else { + const long = + enc === 'big5' + ? (x) => { + const e = map[x] + if (e & 0xff_00) { + u8[i++] = e >> 8 + u8[i++] = e & 0xff + } else if (e || x === 0) { + u8[i++] = e + } else { + i += err(x) + } + } + : (x) => { + i += err(x) + } + + for (let j = i; j < length; j++) { + const x = str.charCodeAt(j) + if (x >= 0xd8_00 && x < 0xe0_00) { + if (x >= 0xdc_00 || j + 1 === length) { + i += err(x) // lone + } else { + const x1 = str.charCodeAt(j + 1) + if (x1 < 0xdc_00 || x1 >= 0xe0_00) { + i += err(x) // lone + } else { + j++ // consume x1 + long(0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10))) + } + } + } else { + const e = map[x] + if (e & 0xff_00) { + u8[i++] = e >> 8 + u8[i++] = e & 0xff + } else if (e || x === 0) { + u8[i++] = e + } else { + i += err(x) + } + } + } + } + + return i === u8.length ? 
u8 : u8.subarray(0, i) + } +} diff --git a/fallback/multi-byte.table.js b/fallback/multi-byte.table.js index 1391eb0..967ff68 100644 --- a/fallback/multi-byte.table.js +++ b/fallback/multi-byte.table.js @@ -104,6 +104,7 @@ export function getTable(id) { res = new Array(sizes[id]) // array of strings or undefined unwrap(res, indices[id], 0, true) // Pointer code updates are embedded into the table + // These are skipped in encoder as encoder uses only pointers >= (0xA1 - 0x81) * 157 res[1133] = '\xCA\u0304' res[1135] = '\xCA\u030C' res[1164] = '\xEA\u0304' diff --git a/multi-byte.js b/multi-byte.js index a6ebd19..c7837da 100644 --- a/multi-byte.js +++ b/multi-byte.js @@ -1,5 +1,5 @@ import { assertUint8 } from './assert.js' -import { multibyteDecoder } from './fallback/multi-byte.js' +import { multibyteDecoder, multibyteEncoder } from './fallback/multi-byte.js' export function createMultibyteDecoder(encoding, loose = false) { const jsDecoder = multibyteDecoder(encoding, loose) // asserts @@ -11,3 +11,9 @@ export function createMultibyteDecoder(encoding, loose = false) { return jsDecoder(arr, stream) } } + +export function createMultibyteEncoder(encoding, { mode = 'fatal' } = {}) { + // TODO: replacement, truncate (replacement will need varying length) + if (mode !== 'fatal') throw new Error('Unsupported mode') + return multibyteEncoder(encoding) // asserts +} diff --git a/multi-byte.node.js b/multi-byte.node.js index 1cdb28d..c8a6e1b 100644 --- a/multi-byte.node.js +++ b/multi-byte.node.js @@ -1,6 +1,6 @@ import { assertUint8 } from './assert.js' import { isDeno, toBuf } from './fallback/_utils.js' -import { isAsciiSuperset, multibyteDecoder } from './fallback/multi-byte.js' +import { isAsciiSuperset, multibyteDecoder, multibyteEncoder } from './fallback/multi-byte.js' import { isAscii } from 'node:buffer' export function createMultibyteDecoder(encoding, loose = false) { @@ -21,3 +21,9 @@ export function createMultibyteDecoder(encoding, loose = false) { return 
jsDecoder(arr, stream) } } + +export function createMultibyteEncoder(encoding, { mode = 'fatal' } = {}) { + // TODO: replacement, truncate (replacement will need varying length) + if (mode !== 'fatal') throw new Error('Unsupported mode') + return multibyteEncoder(encoding) // asserts +} diff --git a/tests/multi-byte.encode.test.js b/tests/multi-byte.encode.test.js new file mode 100644 index 0000000..ea9627f --- /dev/null +++ b/tests/multi-byte.encode.test.js @@ -0,0 +1,268 @@ +import { createMultibyteDecoder, createMultibyteEncoder } from '@exodus/bytes/multi-byte.js' +import { test, describe } from 'node:test' +import { readFileSync } from 'node:fs' +import { join } from 'node:path' + +describe('multi-byte encodings are supersets of ascii', () => { + // Except iso-2022-jp + for (const encoding of ['big5', 'euc-kr', 'euc-jp', 'shift_jis', 'gbk', 'gb18030']) { + test(encoding, (t) => { + const decoder = createMultibyteDecoder(encoding) + const encoder = createMultibyteEncoder(encoding) + for (let i = 0; i < 128; i++) { + let str + try { + str = decoder(Uint8Array.of(i)) + } catch (cause) { + throw new Error(`Error decoding ${i} in ${encoding}`, { cause }) + } + + t.assert.strictEqual(str.length, 1, i) + t.assert.strictEqual(str.codePointAt(0), i, i) + + t.assert.deepStrictEqual(encoder(str), Uint8Array.of(i)) + } + }) + } +}) + +// https://encoding.spec.whatwg.org/#gb18030-encoder step 5 +const gbExceptions = { + E78D: Uint8Array.of(0xa6, 0xd9), + E78E: Uint8Array.of(0xa6, 0xda), + E78F: Uint8Array.of(0xa6, 0xdb), + E790: Uint8Array.of(0xa6, 0xdc), + E791: Uint8Array.of(0xa6, 0xdd), + E792: Uint8Array.of(0xa6, 0xde), + E793: Uint8Array.of(0xa6, 0xdf), + E794: Uint8Array.of(0xa6, 0xec), + E795: Uint8Array.of(0xa6, 0xed), + E796: Uint8Array.of(0xa6, 0xf3), + E81E: Uint8Array.of(0xfe, 0x59), + E826: Uint8Array.of(0xfe, 0x61), + E82B: Uint8Array.of(0xfe, 0x66), + E82C: Uint8Array.of(0xfe, 0x67), + E832: Uint8Array.of(0xfe, 0x6d), + E843: Uint8Array.of(0xfe, 0x7e), + 
E854: Uint8Array.of(0xfe, 0x90), + E864: Uint8Array.of(0xfe, 0xa0), +} + +describe('specific tests', () => { + test('big5', (t) => { + const enc = createMultibyteEncoder('big5') + const dec = createMultibyteDecoder('big5') + + // https://encoding.spec.whatwg.org/#index-big5-pointer + // If codePoint is U+2550 (═), U+255E (╞), U+2561 (╡), U+256A (╪), U+5341 (十), or U+5345 (卅), + // then return the last pointer corresponding to codePoint in index. + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xa4)), String.fromCodePoint(0x25_50)) // 5247 + t.assert.strictEqual(dec(Uint8Array.of(0xf9, 0xf9)), String.fromCodePoint(0x25_50)) // 18991 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x25_50)), Uint8Array.of(0xf9, 0xf9)) // 18991 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xa5)), String.fromCodePoint(0x25_5e)) // 5248 + t.assert.strictEqual(dec(Uint8Array.of(0xf9, 0xe9)), String.fromCodePoint(0x25_5e)) // 18975 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x25_5e)), Uint8Array.of(0xf9, 0xe9)) // 18975 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xa7)), String.fromCodePoint(0x25_61)) // 5250 + t.assert.strictEqual(dec(Uint8Array.of(0xf9, 0xeb)), String.fromCodePoint(0x25_61)) // 18977 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x25_61)), Uint8Array.of(0xf9, 0xeb)) // 18977 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xa6)), String.fromCodePoint(0x25_6a)) // 5249 + t.assert.strictEqual(dec(Uint8Array.of(0xf9, 0xea)), String.fromCodePoint(0x25_6a)) // 18976 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x25_6a)), Uint8Array.of(0xf9, 0xea)) // 18976 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xcc)), String.fromCodePoint(0x53_41)) // 5287 + t.assert.strictEqual(dec(Uint8Array.of(0xa4, 0x51)), String.fromCodePoint(0x53_41)) // 5512 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x53_41)), Uint8Array.of(0xa4, 0x51)) // 5512 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xce)), String.fromCodePoint(0x53_45)) // 5289 + 
t.assert.strictEqual(dec(Uint8Array.of(0xa4, 0xca)), String.fromCodePoint(0x53_45)) // 5599 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x53_45)), Uint8Array.of(0xa4, 0xca)) // 5599 + + // But not others, which return first codepoint in index + t.assert.strictEqual(dec(Uint8Array.of(0xa1, 0xb2)), String.fromCodePoint(0x30_03)) // 5104 + t.assert.strictEqual(dec(Uint8Array.of(0xc6, 0xde)), String.fromCodePoint(0x30_03)) // 10957 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x30_03)), Uint8Array.of(0xa1, 0xb2)) // 5104 + + t.assert.strictEqual(dec(Uint8Array.of(0xa2, 0xcd)), String.fromCodePoint(0x53_44)) // 5288 + t.assert.strictEqual(dec(Uint8Array.of(0xfa, 0xc5)), String.fromCodePoint(0x53_44)) // 19096 + t.assert.deepStrictEqual(enc(String.fromCodePoint(0x53_44)), Uint8Array.of(0xa2, 0xcd)) // 5288 + }) + + test('shift_jis', (t) => { + const enc = createMultibyteEncoder('shift_jis') + const dec = createMultibyteDecoder('shift_jis') + + // https://encoding.spec.whatwg.org/#shift_jis-encoder + t.assert.deepStrictEqual(enc('\u007F'), Uint8Array.of(0x7f)) + t.assert.deepStrictEqual(enc('\u0080'), Uint8Array.of(0x80)) // If codePoint is an ASCII code point or U+0080, then return a byte whose value is codePoint. + t.assert.deepStrictEqual(enc('\u00A5'), Uint8Array.of(0x5c)) // If codePoint is U+00A5 (¥), then return byte 0x5C. + t.assert.deepStrictEqual(enc('\u203E'), Uint8Array.of(0x7e)) // If codePoint is U+203E (‾), then return byte 0x7E. + t.assert.deepStrictEqual(enc('\u2212'), enc('\uFF0D')) // If codePoint is U+2212 (−), then set it to U+FF0D (-). 
+ t.assert.strictEqual(dec(enc('\uFF0D')), '\uFF0D') + t.assert.strictEqual(dec(enc('\u2212')), '\uFF0D') + + for (let i = 0xff_61; i <= 0xff_9f; i++) { + const str = String.fromCodePoint(i) + t.assert.deepStrictEqual(enc(str), Uint8Array.of(i - 0xff_61 + 0xa1)) + t.assert.strictEqual(dec(enc(str)), str) + } + }) + + test('euc-jp', (t) => { + const enc = createMultibyteEncoder('euc-jp') + const dec = createMultibyteDecoder('euc-jp') + + // https://encoding.spec.whatwg.org/#euc-jp-encoder + t.assert.deepStrictEqual(enc('\u007F'), Uint8Array.of(0x7f)) + t.assert.throws(() => enc('\u0080')) + t.assert.deepStrictEqual(enc('\u00A5'), Uint8Array.of(0x5c)) // If codePoint is U+00A5 (¥), then return byte 0x5C. + t.assert.deepStrictEqual(enc('\u203E'), Uint8Array.of(0x7e)) // If codePoint is U+203E (‾), then return byte 0x7E. + t.assert.deepStrictEqual(enc('\u2212'), enc('\uFF0D')) // If codePoint is U+2212 (−), then set it to U+FF0D (-). + t.assert.strictEqual(dec(enc('\uFF0D')), '\uFF0D') + t.assert.strictEqual(dec(enc('\u2212')), '\uFF0D') + for (let i = 0xff_61; i <= 0xff_9f; i++) { + const str = String.fromCodePoint(i) + t.assert.deepStrictEqual(enc(str), Uint8Array.of(0x8e, i - 0xff_61 + 0xa1)) + t.assert.strictEqual(dec(enc(str)), str) + } + }) + + test('euc-kr', (t) => { + const enc = createMultibyteEncoder('euc-kr') + + // https://encoding.spec.whatwg.org/#euc-kr-encoder + t.assert.deepStrictEqual(enc('\u007F'), Uint8Array.of(0x7f)) + t.assert.throws(() => enc('\u0080')) + }) + + test('gb18030, gbk', (t) => { + // gb18030 can encode replacement + t.assert.throws(() => createMultibyteEncoder('gbk')('\uFFFD')) // gbk can't encode it + const rep = createMultibyteEncoder('gb18030')('\uFFFD') + t.assert.strictEqual(createMultibyteDecoder('gb18030')(rep), '\uFFFD') + t.assert.deepStrictEqual(rep, Uint8Array.of(0x84, 0x31, 0xa4, 0x37)) // pointer 39417, valid representation for the replacement char + + // https://encoding.spec.whatwg.org/#gb18030-encoder + // 3. 
If codePoint is U+E5E5, then return error with codePoint. + t.assert.throws(() => createMultibyteEncoder('gbk')('\uE5E5')) // not present in index so doesn't need special handling + t.assert.throws(() => createMultibyteEncoder('gb18030')('\uE5E5')) // excluded from ranges via a specific check + + // gbk and gb18030 encode U+20AC differently, but decode both variants + // https://encoding.spec.whatwg.org/#gb18030-encoder + // 4. If is GBK is true and codePoint is U+20AC (€), then return byte 0x80. + t.assert.deepStrictEqual(createMultibyteEncoder('gb18030')('\u20AC'), Uint8Array.of(0xa2, 0xe3)) + t.assert.deepStrictEqual(createMultibyteEncoder('gbk')('\u20AC'), Uint8Array.of(0x80)) + t.assert.strictEqual(createMultibyteDecoder('gb18030')(Uint8Array.of(0xa2, 0xe3)), '\u20AC') + t.assert.strictEqual(createMultibyteDecoder('gb18030')(Uint8Array.of(0x80)), '\u20AC') + t.assert.strictEqual(createMultibyteDecoder('gbk')(Uint8Array.of(0xa2, 0xe3)), '\u20AC') + t.assert.strictEqual(createMultibyteDecoder('gbk')(Uint8Array.of(0x80)), '\u20AC') + + for (const encoding of ['gb18030', 'gbk']) { + const enc = createMultibyteEncoder(encoding) + for (const [hex, u8] of Object.entries(gbExceptions)) { + t.assert.doesNotThrow( + () => t.assert.deepStrictEqual(enc(String.fromCodePoint(parseInt(hex, 16))), u8), + `${encoding}(U+${hex})` + ) + } + } + }) +}) + +function loadTable(encoding, t) { + const text = readFileSync( + join(import.meta.dirname, 'encoding/fixtures/multi-byte', `index-${encoding}.txt`), + 'utf8' + ) + + const rows = text + .split('\n') + .map((x) => x.trim()) + .filter((x) => x && x[0] !== '#') + .map((x) => x.split('\t')) + .map(([istr, codeHex, description]) => { + const i = Number(istr) + const code = parseInt(codeHex.slice(2), 16) + t.assert.strictEqual(`${i}`, istr) + t.assert.strictEqual('0x' + code.toString(16).padStart(4, '0').toUpperCase(), codeHex) + return { i, code, description } + }) + + t.assert.strictEqual(rows.length, new Set(rows.map((row) => 
row.i)).size) // all unique + return rows +} + +describe('roundtrip, tables', () => { + const encodings = { + big5: 'big5', + shift_jis: 'jis0208', + 'euc-jp': 'jis0208', + 'euc-kr': 'euc-kr', + gbk: 'gb18030', + gb18030: 'gb18030', + } + + for (const [encoding, tableID] of Object.entries(encodings)) { + test(encoding, (t) => { + const enc = createMultibyteEncoder(encoding) + const dec = createMultibyteDecoder(encoding) + const table = loadTable(tableID, t) + const last = new Map(table.map(({ i, code }) => [code, i])) + for (const { i, code, description } of table) { + const str = String.fromCodePoint(code) + + // https://encoding.spec.whatwg.org/#index-big5-pointer excludes low pointers + if (encoding === 'big5' && i < (0xa1 - 0x81) * 157) { + // If last seen with that code is in low pointer range, it should throw + if (last.get(code) === i) t.assert.throws(() => enc(str), description) + continue + } + + t.assert.doesNotThrow(() => t.assert.strictEqual(dec(enc(str)), str), description) + } + }) + } +}) + +const slowEngine = + process.env.EXODUS_TEST_PLATFORM === 'quickjs' || + process.env.EXODUS_TEST_PLATFORM === 'xs' || + process.env.EXODUS_TEST_PLATFORM === 'engine262' +describe('roundtrip, full Unicode', () => { + const MAX = slowEngine ? 0x1_ff_ff : 0x10_ff_ff // Max Unicode codepoint + + test('gb18030', { timeout: 60_000 }, (t) => { + const enc = createMultibyteEncoder('gb18030') + const dec = createMultibyteDecoder('gb18030') + + for (let i = 0; i <= MAX; i++) { + const s = String.fromCodePoint(i) + const id = `U+${i.toString(16).toUpperCase()}` + if (i >= 0xd8_00 && i <= 0xdf_ff) { + // Surrogates + t.assert.throws(() => enc(s), `Surrogate ${id}`) + continue + } + + // https://encoding.spec.whatwg.org/#gb18030-encoder step 3. If codePoint is U+E5E5, then return error with codePoint. 
+ if (i === 0xe5_e5) { + t.assert.throws(() => enc(s), id) + continue + } + + let u8 + t.assert.doesNotThrow(() => { + u8 = enc(s) + }, id) + + if (Object.hasOwn(gbExceptions, i.toString(16).toUpperCase())) { + t.assert.deepStrictEqual(u8, gbExceptions[i.toString(16).toUpperCase()], id) + } else { + t.assert.strictEqual(dec(u8), s, id) + } + } + }) +}) diff --git a/tests/wpt/loader.cjs b/tests/wpt/loader.cjs index 36d94e0..e8271ba 100644 --- a/tests/wpt/loader.cjs +++ b/tests/wpt/loader.cjs @@ -2,6 +2,7 @@ const assert = require('node:assert/strict') const fs = require('node:fs') const path = require('node:path') const { describe, test } = require('node:test') +const { createMultibyteEncoder } = require('@exodus/bytes/multi-byte.js') // TextDecoderStream / TextEncoderStream implementations expect Streams to be present if (!globalThis.ReadableStream) { @@ -146,6 +147,8 @@ function loadTextDecoderHtml(fullName) { assert.ok(encoding && encoding.length > 0) const decoder = new globalThis.TextDecoder(encoding) const fatal = new globalThis.TextDecoder(encoding, { fatal: true }) + const encode = + decoder.encoding === 'iso-2022-jp' ? null : createMultibyteEncoder(decoder.encoding) // TODO: iso-2022-jp if (fullName.endsWith('_errors.html')) { const sep0 = '' @@ -216,6 +219,19 @@ function loadTextDecoderHtml(fullName) { t.assert.strictEqual(fatal.decode(bytes), expected, `${bytesHex} => U+${cpHex}`) } + // Test encoder + // This is limited, encoders are asymmetrical + if ( + !(decoder.encoding === 'euc-jp' && bytes.length === 3) && // no jis0212 encoding in spec + !(decoder.encoding === 'big5' && bytes[0] > 0x7f && bytes[0] <= 0xa0) && // encoding excludes pointers less than (0xA1 - 0x81) × 157. + decoder.encoding !== 'iso-2022-jp' // Not implemented yet + ) { + t.assert.doesNotThrow( + () => t.assert.deepEqual(encode(String.fromCodePoint(cp)), bytes), + `encode U+${cpHex} => ${bytesHex}` + ) + } + tested++ }