From 830f59b2f5fcf2a75351562f7100981f32020477 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Espen=20Raadim?= Date: Wed, 16 Jun 2021 23:52:40 +0200 Subject: [PATCH 1/2] Update parser.js 1. Add support for UTF-16. Detect the encoding by looking for the byte order mark at the start of the incoming buffer and save it as the encoding for the rest of the incoming chunks (https://en.wikipedia.org/wiki/Byte_order_mark); other encodings could probably be detected the same way. Note: the encoding passed in for the incoming chunk will be "buffer", so it cannot be used as the encoding when converting to a string. 2. Trim the name of the opening tag and the closing tag. Failing corner case found when a tag is written with a space before the closing bracket, e.g. "<tag >": the space between the tag name and the end bracket '>' is emitted as the full tag name. --- src/parser.js | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/parser.js b/src/parser.js index fa41366..272ccc9 100644 --- a/src/parser.js +++ b/src/parser.js @@ -40,11 +40,18 @@ export default class Parser extends Writable { this.state = STATE.TEXT; this.buffer = ''; this.pos = 0; + this.detectedEncoding = 'utf8'; this.tagType = TAG_TYPE.NONE; } _write(chunk, encoding, done) { - chunk = typeof chunk !== 'string' ? chunk.toString() : chunk; + + if(typeof chunk !== 'string' && this.pos == 0){ + if(chunk[0] == 0xff && chunk[1] == 0xfe) this.detectedEncoding = 'utf16le'; + if(chunk[0] == 0xfe && chunk[1] == 0xff) this.detectedEncoding = 'utf16be'; + } + + chunk = typeof chunk !== 'string' ? 
chunk.toString(this.detectedEncoding) : chunk; for (let i = 0; i < chunk.length; i++) { let c = chunk[i]; let prev = this.buffer[this.pos - 1]; @@ -108,10 +115,10 @@ export default class Parser extends Writable { let { name, attributes } = this._parseTagString(tag); if (this.tagType & TAG_TYPE.OPENING == TAG_TYPE.OPENING) { - this.emit(EVENTS.OPEN_TAG, name, attributes); + this.emit(EVENTS.OPEN_TAG, name.trim(), attributes); } if (this.tagType & TAG_TYPE.CLOSING == TAG_TYPE.CLOSING) { - this.emit(EVENTS.CLOSE_TAG, name, attributes); + this.emit(EVENTS.CLOSE_TAG, name.trim(), attributes); } this.isCloseTag = false; From 7854f3604c0c8f8c1de805dcfb0321f3300cdb09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Espen=20Raadim?= Date: Thu, 17 Jun 2021 08:31:31 +0200 Subject: [PATCH 2/2] Update parser.js Added proper support for utf16 big endian --- src/parser.js | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/parser.js b/src/parser.js index 272ccc9..afce389 100644 --- a/src/parser.js +++ b/src/parser.js @@ -50,8 +50,16 @@ export default class Parser extends Writable { if(chunk[0] == 0xff && chunk[1] == 0xfe) this.detectedEncoding = 'utf16le'; if(chunk[0] == 0xfe && chunk[1] == 0xff) this.detectedEncoding = 'utf16be'; } + let useEncoding = this.detectedEncoding; - chunk = typeof chunk !== 'string' ? chunk.toString(this.detectedEncoding) : chunk; + //Node does not have built in support for big endian. But we can use swap16 to swap byte pairs to a utf16le + if(typeof chunk !== 'string' && this.detectedEncoding === 'utf16be'){ + chunk.swap16(); + useEncoding = 'utf16le'; + } + + + chunk = typeof chunk !== 'string' ? chunk.toString(useEncoding) : chunk; for (let i = 0; i < chunk.length; i++) { let c = chunk[i]; let prev = this.buffer[this.pos - 1];