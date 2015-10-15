Javascript text tokenizer that is easy to use and compose.
$ npm install tokenize-text
var Tokenizer = require('tokenize-text');
var tokenize = new Tokenizer();
This is the main method of this module; all other methods are built on top of it.
fn will be called with 4 arguments:
text: text value of the token (
text == currentToken.value)
currentToken: current token object
prevToken: previous token (or null)
nextToken: next token (or null)
fn should return a string, an array of strings, a token, or an array of tokens.
tokenize.split(fn) returns a tokenizer function that accepts either a list of tokens or a string (a string is converted to a single token).
The tokenizer function returns an array of tokens with the following properties:
value: text content of the token
index: absolute position in the original text
offset: length of the token (equivalent to
value.length)
// Simple tokenizer that splits the text into 2 sections
var splitIn2 = tokenize.split(function(text, currentToken, prevToken, nextToken) {
return [
text.slice(0, text.length / 2),
text.slice(text.length / 2)
]
});
var tokens = splitIn2('hello');
/*
[
{ value: 'he', index: 0, offset: 2 },
{ value: 'llo', index: 2, offset: 3 }
]
*/
Tokenize using a regular expression:
var extractUppercase = tokenize.re(/[A-Z]/);
var tokens = extractUppercase('aBcD');
/*
[
{ value: 'B', index: 1, offset: 1 },
{ value: 'D', index: 3, offset: 1 }
]
*/
Tokenize and split into characters;
tokenize.characters() is equivalent to
tokenize.re(/[^\s]/).
var tokens = tokenize.characters()('abc');
/*
[
{ value: 'a', index: 0, offset: 1 },
{ value: 'b', index: 1, offset: 1 },
{ value: 'c', index: 2, offset: 1 }
]
*/
Split into sections; sections are delimited by
\n . , ; ! ?.
var tokens = tokenize.sections()('this is sentence 1. this is sentence 2');
/*
[
{
value: 'this is sentence 1',
index: 0,
offset: 18
},
{
value: ' this is sentence 2',
index: 19,
offset: 19
}
]
*/
Split into words:
var tokens = tokenize.words()('hello, how are you?');
/*
[
{ value: 'hello', index: 0, offset: 5 },
{ value: 'how', index: 7, offset: 3 },
{ value: 'are', index: 11, offset: 3 },
{ value: 'you', index: 15, offset: 3 }
]
*/
Filter the list of tokens by calling
fn(token):
// Filter the words to extract the ones that start with an uppercase letter
var extractNames = tokenize.filter(function(word, current, prev) {
return (prev && /[A-Z]/.test(word[0]));
});
// Split texts in words
var words = tokenize.words()('My name is Samy.');
// Apply the filter
var tokens = extractNames(words);
/*
[
{ value: 'Samy', index: 11, offset: 4 }
]
*/
Creates a tokenizer that returns the result of invoking the provided tokenizers for each input token.
var extractNames = tokenize.flow(
// Split text as words
tokenize.words(),
// Filter the words to extract the ones that start with an uppercase letter
tokenize.filter(function(word, current, prev) {
return (prev && /[A-Z]/.test(word[0]));
})
);
var tokens = extractNames('My name is Samy.');
To execute all tokenizers in series, you can use
tokenize.serie(fn1, fn2, [...]) instead.
Example: extracting all repeated words within each sentence:
var repeatedWords = tokenize.flow(
// Tokenize as sections
tokenize.sections(),
// For each sentence
tokenize.flow(
// Tokenize as words
tokenize.words(),
// Filter words to extract only repeated ones
tokenize.filter(function(word, token, prev) {
return (
prev &&
token.value.toLowerCase() === prev.value.toLowerCase()
);
})
)
);
var tokens = repeatedWords('This is great great. Great is an an awesome words');
/*
[
{ value: 'great', index: 14, offset: 5 },
{ value: 'an', index: 33, offset: 2 }
]
*/