diff --git a/README.md b/README.md
index e7972af..37fe9c1 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,60 @@
 [![Build Status](https://travis-ci.org/Floby/node-tokenizer.png)](https://travis-ci.org/Floby/node-tokenizer)
 # Synopsis
-A wide purpose tokenizer for JavaScript. The interface follows more or less
-the WriteStream from [node.js](http://nodejs.org).
+A wide-purpose tokenizer for JavaScript that tokenizes based on rules defined with regular expressions. The interface conforms to the WriteStream from [node.js](http://nodejs.org).
 
-node-tokenizer is published on npm so you can install it with `npm install tokenizer`
+# Installation
+
+    npm i tokenizer
 
 ## How to
-* require the Tokenizer constructor
+**Requiring**
 
 ``` javascript
 var Tokenizer = require('tokenizer');
 ```
-* construct one (we'll see what the callback is used for)
+**Construction**
 
 ``` javascript
-var t = new Tokenizer(mycallback);
+var t = new Tokenizer(mycallback, options);
 ```
-* add rules
+**Setting Options**
+
+Options is an object passed to the constructor and may contain the following properties (defaults shown inline):
+
+    {
+        stepSize: 0,      // For large streams, the maximum size tokenized at a time. Must be larger than the largest expected token.
+        split: undefined  // A regular expression. See 'Splitting into Smaller Pieces'.
+    }
+
+**Adding Rules**
 
 ``` javascript
 t.addRule(/^my regex$/, 'type');
 ```
-* write or pump to it
+**Splitting into Smaller Pieces**
+
+By default, the tokenizer attempts to find the longest match in the input stream. This can be a large performance hit for big files. If you are certain that your tokens never cross a particular regular-expression boundary (such as /\n/), you can split the input on that boundary before tokenization, which can improve performance dramatically.
+
+``` javascript
+// Break CSV input into pieces and tokenize each piece separately, in the order of the original input
+t = new Tokenizer(undefined, {
+    split: /\,/
+});
+```
+
+``` javascript
+// Break a file up by lines and tokenize each line separately
+t = new Tokenizer(undefined, {
+    split: /\r?\n/
+});
+```
+
+**Writing/Piping**
 
 ``` javascript
 t.write(data);
@@ -34,18 +62,18 @@ t.write(data);
 stream.pipe(t);
 ```
 
-* listen for new tokens
+**Listening for tokens**
 
 ``` javascript
 t.on('token', function(token, type) {
     // do something useful
     // type is the type of the token (specified with addRule)
     // token is the actual matching string
-})
+});
 // alternatively you can use the tokenizer as a readable stream.
 ```
 
-* look out for the end
+**Listening for completion**
 
 ``` javascript
 t.on('end', callback);
@@ -63,24 +91,32 @@ and match, an object like this
 }
 ```
 
-Have a look in the example folder
+## Examples
+
+Take a look at the [examples](https://github.com/Floby/node-tokenizer/tree/master/examples) folder.
 
 ## Rules
-rules are regular expressions associated with a type name.
+
+Rules are regular expressions associated with a type name.
+
 The tokenizer tries to find the longest string matching one or more rules.
 When several rules match the same string, priority is given to the rule
-which was added first. (this may change)
+which was added first.
 
-Please note that your regular expressions should use ^ and $ in order
-to test the whole string. If these are not used, you rule will match
-_every_ string that contains what you specified, this could be the whole file!
+Note: normally your regular expressions should use ^ and $ in order
+to test the whole string. If these are not used, your rule will match
+_every_ string that contains what you specified; this could be the whole file!
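+
+For example, an illustrative sketch of the difference (the rule names here are arbitrary):
+
+``` javascript
+t.addRule(/^\d+$/, 'number'); // anchored: matches only when the whole candidate string is digits
+t.addRule(/\d/, 'digit');     // unanchored: matches any candidate string that merely contains a digit
+```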
 
 ## To do
-* a lot of optimisation
-* being able to share rules across several tokenizers
-  (although this can be achieved through inheritance)
-* probably more hooks
-* more checking
+
+* Continued optimisation
+* Rule sharing across several tokenizers (although this can be achieved through inheritance)
+* More hooks
+* Increased test coverage
+
+## Testing
+
+Testing is provided via nodeunit; run the suite with `npm test`.
 
 ## License
diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js
index 01dc3d2..1846045 100644
--- a/lib/Tokenizer.js
+++ b/lib/Tokenizer.js
@@ -1,99 +1,158 @@
-var EventEmitter = require('events').EventEmitter;
-var util = require('util');
-var assert = require('assert');
-var Transform = require('stream').Transform;
-var disect = require('disect');
+// Fork of the npm 'tokenizer' module by Floby ((c) Floby), with modifications.
+
+var EventEmitter = require('events').EventEmitter,
+    util = require('util'),
+    assert = require('assert'),
+    Transform = require('stream').Transform;
 
 function noop(){}
 
-function Tokenizer (check_token_cb, options) {
-    if(!(this instanceof Tokenizer)) {
-        return new Tokenizer(check_token_cb);
-    }
+function Tokenizer (check_token_cb, options, error_cb) {
+    if(!(this instanceof Tokenizer)) {
+        return new Tokenizer(check_token_cb, options, error_cb);
+    }
+
+    this.options = options || {};
+    this.options.stepSize = this.options.hasOwnProperty('stepSize') ? this.options.stepSize : 0;
+
+    Transform.call(this, options);
 
-    Transform.call(this, options);
-    this._readableState.objectMode = true;
-    this._buffered = ""; // we buffer untokenized data between writes
-    this._regexes = []; // should contain objects
-                        // with regex[RegExp] and type[String]
-    this._ignored = {}; // a hash of ignored token types
-                        // these will be parsed but not emitted
-    this._checkToken = check_token_cb || noop;
+    this._readableState.objectMode = true;
+    this._buffered = ''; // we buffer untokenized data between writes
+    this._regexes = [];  // should contain objects with regex[RegExp] and type[String]
+    this._ignored = {};  // a hash of ignored token types; these will be parsed but not emitted
+    this._checkToken = check_token_cb || noop;
+    this._error = error_cb;
 }
+
 util.inherits(Tokenizer, Transform);
 
 Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) {
     chunk = chunk.toString();
     var self = this;
+
     process.nextTick(function () {
-        try {
-            var index = 0, step = 64;
-            while(index < chunk.length) {
+        var index = 0,
+            step = self.options.stepSize;
+
+        if (self.options.stepSize > 0) {
+            while (index < chunk.length) {
                 self._tokenize(chunk.substr(index, step));
                 index += step;
             }
-            callback();
-        } catch(e) {
-            callback(e);
         }
+        else self._tokenize(chunk);
+
+        callback();
     })
 };
 
-Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) {
-    for (var i = 0; i < this._regexes.length; ++i) {
-        if(this._regexes[i].regex.test(str)) {
-            return this._regexes[i];
+Tokenizer.prototype._getLongestMatch = function _getLongestMatch(str) {
+    var bestMatch = undefined,
+        longestMatchLen = 0;
+
+    // Find the longest match that matches at the beginning of the string.
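+    // Each rule's regex is run against `str` and its first match (matches[0]) is taken,
+    // so rules are expected to be anchored with ^ so that they match from the start.
+    // Rules with a filter are skipped when the filter rejects `str`. Among the matching
+    // rules the longest match wins; on a tie the rule added first wins, and the scan
+    // stops early once a match covers the whole string.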
+    for (var i = 0; i < this._regexes.length; i++) {
+        if (this._regexes[i].filter && !this._regexes[i].filter(str))
+            continue;
+
+        var match = undefined,
+            matches = str.match(this._regexes[i].regex);
+
+        if (matches && matches.length) {
+            if ((match = matches[0]).length > longestMatchLen) {
+                longestMatchLen = match.length;
+                bestMatch = {
+                    rule: this._regexes[i],
+                    match: match,
+                    length: match.length,
+                    matchesAll: longestMatchLen == str.length
+                };
+
+                if (longestMatchLen == str.length)
+                    break;
             }
+        }
     }
-    return null;
+
+    return bestMatch;
 };
 
-Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) {
-    var regexes = this._regexes;
-    // in case we buffered data on previous writes
-    data = this._buffered + data;
-    this._buffered = '';
-    if(!data.length) {
-        return;
+Tokenizer.prototype._firstMatchLength = function(str, regex) {
+    for (var i = 1; i < str.length; i++)
+        if (regex.test(str.substr(0, i)))
+            return i;
+    return -1;
+};
+
+Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) {
+    // prepend any data buffered from previous writes
+    data = this._buffered + data;
+    this._buffered = '';
+
+    while (data && data.length) {
+        var match = undefined,
+            str = undefined,
+            ix = -1,
+            removeEOL = false;
+
+        if (this.options.split) {
+            while ((ix = data.search(this.options.split)) == 0) {
+                var len = this._firstMatchLength(data, this.options.split);
+
+                if (len != -1) {
+                    this.emit('split', data.substr(0, len));
+
+                    data = data.substr(len);
+                }
+                else return;
+            }
+
+            if (ix != -1)
+                removeEOL = true;
+            str = ix != -1 ? data.substr(0, ix) + '\n' : data;
+            data = ix != -1 ? data.substr(ix) : undefined;
         }
+        else {
+            str = data;
+            data = undefined;
+        }
+
+        match = this._getLongestMatch(str);
 
-    var self = this;
-    var maxIndex = disect(0, data.length, function (index) {
-        var buf = data.substring(0, index + 1);
-        return self._getMatchingRule(buf) === null;
-    });
+        if (!match) {
+            var err = new SyntaxError('No rules found to match any part of \'' + str.toString() + '\'');
 
-    if(maxIndex === 0) {
-        // no match found
-        throw new SyntaxError('could not tokenize ' + JSON.stringify(data));
+            if (this._error) {
+                this._error(err);
+                return;
+            }
+            throw err;
         }
-    else if (maxIndex === data.length && !nobuffer) {
-        // the whole string is matching
-        this._buffered = data;
+        else if (match.matchesAll && !endofstream && (!data || !data.length)) {
+            this._buffered = str;
             return;
         }
-    else {
-        // some substring is matching
-        var str = data.substring(0, maxIndex);
-        var rule = this._getMatchingRule(str);
-        if(!rule) {
-            throw new Error('wut ?');
-        }
-        this._gotToken(str, rule);
-        this._tokenize(data.substring(maxIndex), nobuffer);
-    }
+
+        if (removeEOL)
+            str = str.substr(0, str.length - 1);
+
+        data = str.substr(match.length) + (data || '');
+        str = str.substr(0, match.length);
+
+        this._gotToken(str, match.rule);
+    } // while
 };
 
 Tokenizer.prototype._flush = function _flush(callback) {
-    var self = this;
-    process.nextTick(function () {
-        try {
-            self._tokenize('', true);
-            callback();
-        } catch(e) {
-            callback(e);
-        }
-    });
+    this._tokenize('', true);
+    callback();
 };
 
 var Token = function String (content, type) {
@@ -109,50 +168,50 @@ Token.prototype.valueOf = function valueOf() {
 };
 
 Tokenizer.prototype._gotToken = function _gotToken(str, rule) {
-    // notify the token checker
-    var type = this._checkToken(str, rule) || rule.type;
-    if(this._ignored[type]) return;
-    var token = new Token(str, type);
+    // determine the token type
+    var type = rule.type || this._checkToken(str, rule);
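+    // Note: unlike the original, the rule's declared type now takes precedence over the
+    // value returned by check_token_cb; the callback is only consulted (and only invoked)
+    // when the rule has no type of its own.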
+    if(this._ignored[type]) return;
+    var token = new Token(str, type);
 
     this.push(token);
 
     this.emit('token', token, type);
 };
 
-Tokenizer.prototype.addRule = function addRule(regex, type) {
-    // this is useful for built-in rules
-    if(!type) {
-        if(Array.isArray(regex)) {
-            return this.addRule(regex[0], regex[1]);
-        }
-        else if(regex) {
-            return this.addRule(Tokenizer[regex]);
-        }
-        else {
-            throw new Error('No parameters specified');
-        }
+Tokenizer.prototype.addRule = function addRule(regex, type, filter) {
+    // this is useful for built-in rules
+    if(!type) {
+        if(Array.isArray(regex)) {
+            return this.addRule(regex[0], regex[1], filter);
+        }
+        else if(regex) {
+            return this.addRule(Tokenizer[regex], undefined, filter);
+        }
+        else {
+            throw new Error('No parameters specified');
+        }
     }
-    assert.ok((regex instanceof RegExp) || (typeof regex === 'function'));
-    assert.equal(typeof type, 'string');
-    this._regexes.push({regex:regex,type:type});
+    assert.ok((regex instanceof RegExp) || (typeof regex === 'function'));
+    assert.equal(typeof type, 'string');
+    this._regexes.push({
+        regex: regex,
+        type: type,
+        filter: filter
+    });
 };
 
 /**
  * set some tokens to be ignored. these won't be emitted
  */
 Tokenizer.prototype.ignore = function ignore(ignored) {
-    if(Array.isArray(ignored)) {
-        for (var i = 0; i < ignored.length; ++i) {
-            this.ignore(ignored[i]);
-        }
-        return;
-    }
-    this._ignored[ignored] = true;
+    if (Array.isArray(ignored))
+        return ignored.forEach(this.ignore.bind(this));
+    this._ignored[ignored] = true;
 };
 
 module.exports = Tokenizer;
 
 // built-in rules
-Tokenizer.whitespace = [/^(\s)+$/, 'whitespace'];
-Tokenizer.word = [/^\w+$/, 'word'];
-Tokenizer.number = [/^\d+(\.\d+)?$/, 'number'];
+Tokenizer.whitespace = [/^(\s)+/, 'whitespace'];
+Tokenizer.word = [/^\w+/, 'word'];
+Tokenizer.number = [/^\d+(\.\d+)?/, 'number'];
\ No newline at end of file
diff --git a/package.json b/package.json
index ea984c2..a47cfe0 100644
--- a/package.json
+++ b/package.json
@@ -1,13 +1,15 @@
 {
   "name": "tokenizer",
-  "description": "A wide purpose tokenizer for node.js which looks like a stream",
-  "version": "1.1.2",
+  "description": "A wide purpose tokenizer for node.js which extends the built-in 'stream' module.",
+  "version": "1.2.0",
   "homepage": "http://github.com/floby/node-tokenizer",
   "repository": {
     "type": "git",
     "url": "git://github.com/Floby/node-tokenizer.git"
   },
   "author": "Florent Jaby ",
+  "contributors": [],
+  "main": "lib/Tokenizer.js",
   "scripts": {
     "test": "nodeunit test/test-tokenizer.js"
@@ -20,8 +22,5 @@
   },
   "devDependencies": {
     "nodeunit": "~0.8.1"
-  },
-  "dependencies": {
-    "disect": "~1.1.0"
   }
 }
diff --git a/test/test-perf.js b/test/test-perf.js
index 219cf4e..0f1bb61 100644
--- a/test/test-perf.js
+++ b/test/test-perf.js
@@ -1,10 +1,11 @@
-var tokenizer = require('../');
-var domain = require('domain');
+var tokenizer = require('../'),
+    domain = require('domain');
 
 Function.prototype.withDomain = function(withStack) {
     var fn = this;
     return function(test) {
         var d = domain.create();
+
         d.on('error', function(e) {
             test.fail('test failed with ' + e.message);
             if(withStack) {
@@ -12,6 +13,7 @@ Function.prototype.withDomain = function(withStack) {
             }
             test.done();
         });
+
         d.run(fn.bind(this, test));
     }
 }
@@ -42,20 +44,17 @@ Function.prototype.timed = function (timeout) {
     }
 }
 
-
-
 exports['test big file of small integers'] = function (test) {
     var numbers = [0];
     for (var i = 0; i < 100000; ++i) {
-        numbers.push(Math.floor(Math.random() * 10000));
+        numbers.push(Math.floor(Math.random() * 100000));
     };
-    var t = tokenizer();
+    var t = tokenizer(undefined, {split: /\,/});
     t.addRule('number');
     t.addRule(/^\d+\.$/, 'maybe-float');
     t.addRule('whitespace');
     t.addRule(/^,$/, 'comma');
     t.ignore('whitespace');
-    t.ignore('comma');
     t.on('data', function(token) { });
     t.on('end', test.done.bind(test));
diff --git a/test/test-tokenizer.js b/test/test-tokenizer.js
index 5e3e3c3..d708a15 100644
--- a/test/test-tokenizer.js
+++ b/test/test-tokenizer.js
@@ -16,7 +16,6 @@ Function.prototype.withDomain = function(withStack) {
     }
 }
 
-
 exports['test empty'] = function(test) {
     var t = tokenizer();
     t.on('data', test.fail.bind(test, "No data should be emitted"));
@@ -174,3 +173,26 @@ exports['words in two chunks'] = function(test) {
     t.write('Hell');
     t.end('o World');
 }.withDomain();
+
+exports['verify regex priority order and that longest matches first'] = function(test) {
+    // Regression test derived from a SLIM-template tokenizer whose rules originally failed to tokenize correctly.
+    var t = tokenizer(undefined, {split: /^\r?\n+$/});
+    t.addRule(/^([a-zA-Z0-9\-_]+\s*=\s*)(["'])(\\\2|[^"']+)*?\2$/, 'tKeyValue'); // name='value'
+    t.addRule(/^[a-zA-Z0-9\-_]+$/, 'tIdentifier'); // name
+    t.addRule(/^[#][a-zA-Z0-9\-_]+$/, 'tIdName'); // #name
+    t.addRule(/^\.[a-zA-Z0-9\-_]+$/, 'tClassName'); // .name
+    t.addRule('whitespace');
+    t.ignore('whitespace');
+
+    var expectations = ['tIdentifier', 'tIdName', 'tClassName', 'tKeyValue', 'tKeyValue'];
+
+    t.on('data', function(token) {
+        var e = expectations.shift();
+
+        test.equal(e, token.type);
+    });
+
+    t.on('end', test.done.bind(test));
+    t.write('tag#id.class var1 = \'value1\' var2 = \'value2\'');
+    t.end();
+}.withDomain();
\ No newline at end of file