Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 216 additions & 2 deletions fallback/multi-byte.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2 } from './latin1.js'
import { E_STRING } from './_utils.js'
import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2, encodeAscii } from './latin1.js'
import { getTable } from './multi-byte.table.js'

// Message of the TypeError thrown by multibyteEncoder when a code unit cannot be
// encoded and no onError callback was supplied; also passed to encodeAscii there.
export const E_STRICT = 'Input is not well-formed for this encoding'

// TODO: optimize
/* Decoders */

// If the decoder is not cleared properly, state can be preserved between non-streaming calls!
// See comment about fatal stream
Expand Down Expand Up @@ -504,3 +505,216 @@ export function multibyteDecoder(enc, loose = false) {
return res + mapper.decode(arr, res.length, arr.length, stream)
}
}

/* Encoders */

// Encode maps already built by getMap, keyed by encoding id
const maps = new Map()

// [a, b] pairs: getMap stores (0xa6_00 | b) at code point (0xe7_00 | a) for gb18030 / gbk
const e7 = [[148, 236], [149, 237], [150, 243]] // prettier-ignore
// [a, b] pairs: getMap stores (0xfe_00 | b) at code point (0xe8_00 | a) for gb18030 / gbk
const e8 = [[30, 89], [38, 97], [43, 102], [44, 103], [50, 109], [67, 126], [84, 144], [100, 160]] // prettier-ignore

// Per-encoding conversion of a table pointer into a packed (lead << 8) | trail byte pair
const preencoders = {
  __proto__: null,
  big5: (p) => {
    const row = (p / 157) | 0
    const col = p % 157
    const trail = (col < 0x3f ? 0x40 : 0x62) + col
    return ((row + 0x81) << 8) | trail
  },
  shift_jis: (p) => {
    const row = (p / 188) | 0
    const col = p % 188
    const lead = row + (row < 0x1f ? 0x81 : 0xc1)
    const trail = (col < 0x3f ? 0x40 : 0x41) + col
    return (lead << 8) | trail
  },
  'euc-jp': (p) => {
    const lead = ((p / 94) | 0) + 0xa1
    const trail = (p % 94) + 0xa1
    return (lead << 8) | trail
  },
  'euc-kr': (p) => {
    const lead = ((p / 190) | 0) + 0x81
    const trail = (p % 190) + 0x41
    return (lead << 8) | trail
  },
  gb18030: (p) => {
    const lead = ((p / 190) | 0) + 0x81
    const col = p % 190
    const trail = (col < 0x3f ? 0x40 : 0x41) + col
    return (lead << 8) | trail
  },
}

// gbk reuses the gb18030 two-byte layout (same function object)
preencoders.gbk = preencoders.gb18030

// Encoders deliberately spend a non-trivial amount of memory to gain speed:
// most maps are 128 KiB, big5 is 380 KiB; each one is built lazily at first use and then cached

/**
 * Returns the encode map for `id`, building and caching it on the first call.
 *
 * The map is indexed by code point. A value with a non-zero high byte is a packed
 * (lead << 8) | trail pair, any other non-zero value is a single byte, and 0 means
 * "no mapping" (except at index 0, which maps to byte 0).
 *
 * @param {string} id - encoding id
 * @param {number} size - number of entries to allocate for a newly built map
 * @returns {Uint16Array} the (possibly cached) map
 */
function getMap(id, size) {
  const existing = maps.get(id)
  if (existing) return existing

  const isBig5 = id === 'big5'
  const sjis = id === 'shift_jis'
  const isJis = sjis || id === 'euc-jp'

  // gbk shares the gb18030 table; euc-jp and shift_jis share jis0208
  let tname = id
  if (isJis) tname = 'jis0208'
  else if (id === 'gbk') tname = 'gb18030'

  const table = getTable(tname)
  const map = new Uint16Array(size)
  const toBytes = preencoders[id] || ((pointer) => pointer + 1)
  // In big5 the first pointer wins for duplicated entries, except for these, where the last wins
  const lastWins = new Set([0x25_50, 0x25_5e, 0x25_61, 0x25_6a, 0x53_41, 0x53_45])

  for (let pointer = 0; pointer < table.length; pointer++) {
    const c = table[pointer]
    if (c === REP || c === undefined) continue

    let skip
    if (isBig5) {
      // pointers below 5024 are not used for encoding; this also skips multi-codepoint strings
      skip = pointer < 5024 || (map[c] && !lastWins.has(c))
    } else {
      // shift_jis ignores pointers 8272..8835; otherwise the first pointer wins
      skip = (sjis && pointer >= 8272 && pointer <= 8835) || map[c]
    }

    if (skip) continue

    // a string entry is always a single codepoint at this point
    const cp = typeof c === 'string' ? c.codePointAt(0) : c
    map[cp] = toBytes(pointer)
  }

  // ASCII maps to itself
  for (let cp = 0; cp < 0x80; cp++) map[cp] = cp

  if (isJis) {
    if (sjis) map[0x80] = 0x80
    // U+FF61..U+FF9F: bytes 0xA1..0xDF in shift_jis, 0x8EA1..0x8EDF in euc-jp
    const shift = sjis ? 0xfe_c0 : 0x70_c0
    for (let cp = 0xff_61; cp <= 0xff_9f; cp++) map[cp] = cp - shift
    // U+2212 is encoded the same way as U+FF0D
    map[0x22_12] = map[0xff_0d]
    map[0xa5] = 0x5c
    map[0x20_3e] = 0x7e
  } else if (tname === 'gb18030') {
    if (id === 'gbk') map[0x20_ac] = 0x80
    for (let cp = 0xe7_8d; cp <= 0xe7_93; cp++) map[cp] = cp - 0x40_b4
    for (const [low, trail] of e7) map[0xe7_00 | low] = 0xa6_00 | trail
    for (const [low, trail] of e8) map[0xe8_00 | low] = 0xfe_00 | trail
  }

  maps.set(id, map)
  return map
}

// Encoding ids accepted by multibyteEncoder
const encoders = new Set(['big5', 'euc-kr', 'euc-jp', 'shift_jis', 'gbk', 'gb18030'])
// Matches any UTF-16 code unit above 0xFF
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
// 'gb18030-ranges' table, loaded the first time a gb18030 encoder is created
let gb18030r

/**
 * Creates an encoder function for one of the supported multi-byte encodings.
 *
 * @param {string} enc - one of 'big5', 'euc-kr', 'euc-jp', 'shift_jis', 'gbk', 'gb18030'
 * @param {Function} [onError] - called as onError(code, u8, i) with the offending code unit or
 *   code point, the output buffer and the current write offset. Its return value is added to
 *   the write offset, so it is presumably the number of bytes the handler wrote (confirm with
 *   callers). When omitted, unencodable input throws TypeError(E_STRICT).
 * @returns {Function} (str) => Uint8Array; throws TypeError(E_STRING) when str is not a string
 * @throws {RangeError} when enc is not a supported encoding
 */
export function multibyteEncoder(enc, onError) {
if (!encoders.has(enc)) throw new RangeError('Unsupported encoding')
const size = enc === 'big5' ? 0x2_f8_a7 : 0x1_00_00 // for big5, max codepoint in table + 1
// Maximum number of output bytes reserved per UTF-16 code unit
const width = enc === 'gb18030' ? 4 : 2
const map = getMap(enc, size)
if (enc === 'gb18030' && !gb18030r) gb18030r = getTable('gb18030-ranges')

return (str) => {
if (typeof str !== 'string') throw new TypeError(E_STRING)
// No code unit above 0xFF: try encodeAscii first. If it throws (presumably on non-ASCII
// input; its definition is not visible here), fall through to the generic path below.
if (!NON_LATIN.test(str)) {
try {
return encodeAscii(str, E_STRICT)
} catch {}
}

const length = str.length
const u8 = new Uint8Array(length * width)
// i is the write offset into u8; the closures below (err, encode, long) all advance it
let i = 0
// Copy the leading ASCII run one byte per code unit
while (i < length) {
const x = str.charCodeAt(i)
if (x >= 128) break
u8[i++] = x
}

// Reports unencodable input: delegates to onError when given, otherwise throws.
// Callers add the returned value to i.
// eslint-disable-next-line unicorn/consistent-function-scoping
const err = (code) => {
if (onError) return onError(code, u8, i)
throw new TypeError(E_STRICT)
}

if (!map || map.length < size) throw new Error('Unreachable') // Important for perf
if (enc === 'gb18030') {
// Deduping this branch hurts other encoders perf
// Writes cp as four bytes. Each gb18030r entry is a pair [c, d]; the last entry with d <= cp
// is selected (the early break assumes entries are ordered by d) and the pointer is
// c + cp - d, with 0xe7c7 special-cased to pointer 7457.
const encode = (cp) => {
let a = 0, b = 0 // prettier-ignore
for (const [c, d] of gb18030r) {
if (d > cp) break
a = c
b = d
}

let rp = cp === 0xe7_c7 ? 7457 : a + cp - b
u8[i++] = 0x81 + ((rp / 12_600) | 0)
rp %= 12_600
u8[i++] = 0x30 + ((rp / 1260) | 0)
rp %= 1260
u8[i++] = 0x81 + ((rp / 10) | 0)
u8[i++] = 0x30 + (rp % 10)
}

// Resume after the ASCII prefix; j indexes the input, i the output
for (let j = i; j < length; j++) {
const x = str.charCodeAt(j)
if (x >= 0xd8_00 && x < 0xe0_00) {
// Surrogate: an error unless it is a high surrogate followed by a low surrogate
if (x >= 0xdc_00 || j + 1 === length) {
i += err(x) // lone
} else {
const x1 = str.charCodeAt(j + 1)
if (x1 < 0xdc_00 || x1 >= 0xe0_00) {
i += err(x) // lone
} else {
j++ // consume x1
encode(0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10)))
}
}
} else {
const e = map[x]
if (e & 0xff_00) {
// Two-byte mapping packed as (lead << 8) | trail
u8[i++] = e >> 8
u8[i++] = e & 0xff
} else if (e || x === 0) {
// Single-byte mapping (a zero entry is only valid for code unit 0)
u8[i++] = e
} else if (x === 0xe5_e5) {
// 0xe5e5 is reported as an error instead of being range-encoded
i += err(x)
} else {
// Not in the map: use the four-byte form
encode(x)
}
}
}
} else {
// Handler for code points assembled from surrogate pairs: big5 looks them up in its map
// (which is sized beyond 0xffff), every other encoding reports an error
const long =
enc === 'big5'
? (x) => {
const e = map[x]
if (e & 0xff_00) {
u8[i++] = e >> 8
u8[i++] = e & 0xff
} else if (e || x === 0) {
u8[i++] = e
} else {
i += err(x)
}
}
: (x) => {
i += err(x)
}

// Resume after the ASCII prefix; j indexes the input, i the output
for (let j = i; j < length; j++) {
const x = str.charCodeAt(j)
if (x >= 0xd8_00 && x < 0xe0_00) {
// Surrogate: an error unless it is a high surrogate followed by a low surrogate
if (x >= 0xdc_00 || j + 1 === length) {
i += err(x) // lone
} else {
const x1 = str.charCodeAt(j + 1)
if (x1 < 0xdc_00 || x1 >= 0xe0_00) {
i += err(x) // lone
} else {
j++ // consume x1
long(0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10)))
}
}
} else {
const e = map[x]
if (e & 0xff_00) {
// Two-byte mapping packed as (lead << 8) | trail
u8[i++] = e >> 8
u8[i++] = e & 0xff
} else if (e || x === 0) {
// Single-byte mapping (a zero entry is only valid for code unit 0)
u8[i++] = e
} else {
i += err(x)
}
}
}
}

// Return the buffer itself when it was filled exactly, otherwise a view of the written part
return i === u8.length ? u8 : u8.subarray(0, i)
}
}
1 change: 1 addition & 0 deletions fallback/multi-byte.table.js
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ export function getTable(id) {
res = new Array(sizes[id]) // array of strings or undefined
unwrap(res, indices[id], 0, true)
// Pointer code updates are embedded into the table
// These are skipped in encoder as encoder uses only pointers >= (0xA1 - 0x81) * 157
res[1133] = '\xCA\u0304'
res[1135] = '\xCA\u030C'
res[1164] = '\xEA\u0304'
Expand Down
8 changes: 7 additions & 1 deletion multi-byte.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { assertUint8 } from './assert.js'
import { multibyteDecoder } from './fallback/multi-byte.js'
import { multibyteDecoder, multibyteEncoder } from './fallback/multi-byte.js'

export function createMultibyteDecoder(encoding, loose = false) {
const jsDecoder = multibyteDecoder(encoding, loose) // asserts
Expand All @@ -11,3 +11,9 @@ export function createMultibyteDecoder(encoding, loose = false) {
return jsDecoder(arr, stream)
}
}

/**
 * Creates an encoder function for the given multi-byte encoding.
 *
 * @param {string} encoding - encoding id, validated by multibyteEncoder
 * @param {{ mode?: string }} [options] - only mode 'fatal' (the default) is supported
 * @returns {Function} the encoder produced by multibyteEncoder
 * @throws {Error} 'Unsupported mode' for any mode other than 'fatal'
 */
export function createMultibyteEncoder(encoding, { mode = 'fatal' } = {}) {
  // TODO: replacement, truncate (replacement will need varying length)
  if (mode === 'fatal') return multibyteEncoder(encoding) // asserts
  throw new Error('Unsupported mode')
}
8 changes: 7 additions & 1 deletion multi-byte.node.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { assertUint8 } from './assert.js'
import { isDeno, toBuf } from './fallback/_utils.js'
import { isAsciiSuperset, multibyteDecoder } from './fallback/multi-byte.js'
import { isAsciiSuperset, multibyteDecoder, multibyteEncoder } from './fallback/multi-byte.js'
import { isAscii } from 'node:buffer'

export function createMultibyteDecoder(encoding, loose = false) {
Expand All @@ -21,3 +21,9 @@ export function createMultibyteDecoder(encoding, loose = false) {
return jsDecoder(arr, stream)
}
}

/**
 * Creates an encoder function for the given multi-byte encoding.
 *
 * @param {string} encoding - encoding id, validated by multibyteEncoder
 * @param {{ mode?: string }} [options] - only mode 'fatal' (the default) is supported
 * @returns {Function} the encoder produced by multibyteEncoder
 * @throws {Error} 'Unsupported mode' for any mode other than 'fatal'
 */
export function createMultibyteEncoder(encoding, { mode = 'fatal' } = {}) {
  // TODO: replacement, truncate (replacement will need varying length)
  if (mode === 'fatal') return multibyteEncoder(encoding) // asserts
  throw new Error('Unsupported mode')
}
Loading
Loading