I'm trying to build JavaScript code that reads one string (say, a sentence of English text), then outputs another string of (comma-separated) words that were "uncommon".
The words you want to remove are called stop words, which are:
["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"]
here is the source: http://99webtools.com/list-english-stop-words.php
so your code should be
/**
 * Returns the words of a sentence that are not stop words.
 *
 * The sentence is tokenized into runs of word characters; apostrophes that sit
 * between word characters are kept so contractions such as "don't" can be
 * matched against the contraction entries of the stop-word list.
 *
 * @param {string} sentence - Text to filter.
 * @returns {string[]} Lower-cased non-stop words, in their original order.
 *     Empty when the sentence contains no words.
 */
function getNoneStopWords(sentence) {
  var stopWords = getStopWords();
  // Null-prototype map: a plain {} inherits keys such as "constructor",
  // which would make those ordinary words look like stop words.
  var stopWordLookup = Object.create(null);
  var nonStopWords = [];
  var word;
  var i;

  for (i = 0; i < stopWords.length; i++) {
    stopWordLookup[stopWords[i].trim().toLowerCase()] = true;
  }

  // match() returns null (not an empty array) when nothing matches.
  var tokens = sentence.match(/\w+(?:'\w+)*/g) || [];

  for (i = 0; i < tokens.length; i++) {
    word = tokens[i].toLowerCase();
    if (!stopWordLookup[word]) {
      nonStopWords.push(word);
    }
  }
  return nonStopWords;
}
/**
 * Supplies the English stop-word list used for filtering.
 *
 * @returns {string[]} A newly built array on every call: the plain words
 *     first, followed by the contractions.
 */
function getStopWords() {
  var plainWords = [
    "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among",
    "an", "and", "any", "are", "as", "at", "be", "because", "been", "but",
    "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else",
    "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he",
    "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into",
    "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me",
    "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off",
    "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say",
    "says", "she", "should", "since", "so", "some", "than", "that", "the", "their",
    "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas",
    "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while",
    "who", "whom", "why", "will", "with", "would", "yet", "you", "your"
  ];
  var contractions = [
    "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd",
    "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't",
    "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've",
    "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd",
    "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll",
    "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't",
    "you'd", "you'll", "you're", "you've"
  ];
  return plainWords.concat(contractions);
}
Build an associative array of common words first, then tokenize the sentence and output any words not contained in it. E.g.:
// Build a lookup of the common words, then keep every word of `sentence`
// that is not in it. Expects `common_words` (comma-separated string) and
// `sentence` (string) to be defined before this snippet runs.

// Null-prototype object: a plain object inherits keys such as "constructor",
// which would wrongly exclude those words.
var excluded = Object.create(null);
common_words = common_words.split(",");
// Indexed loops instead of for...in, which also visits inherited enumerable
// properties when used on arrays.
for (var i = 0; i < common_words.length; i++) {
  excluded[common_words[i].trim().toLowerCase()] = true;
}

var result = [];
// match() returns null when the sentence has no words; fall back to [].
var match = sentence.match(/\w+/g) || [];
for (i = 0; i < match.length; i++) {
  if (!excluded[match[i].toLowerCase()]) {
    result.push(match[i]);
  }
}

var uncommon_words = result.join(", ");
Here's a start, I reckon:
// Naive approach: compare every word of `sentence` with every entry of
// `common_words` (a ", "-separated string) and keep the words that match none.
// Expects `sentence` and `common_words` to be defined before this snippet runs.

// Split on runs of non-word characters and drop the empty strings that
// leading/trailing punctuation would otherwise produce.
var sentence_arr = sentence.split(/\W+/).filter(function (token) {
  return token.length > 0;
});
var common_arr = common_words.split(', ');
var uncommon_arr = [];

for (var i = 0; i < sentence_arr.length; i++) {
  var word = sentence_arr[i].toLowerCase();
  var isCommon = false;

  for (var j = 0; j < common_arr.length; j++) {
    if (word === common_arr[j].toLowerCase()) {
      isCommon = true;
      break;
    }
  }

  // Push once, and only after the word failed to match every common word.
  if (!isCommon) {
    uncommon_arr.push(word);
  }
}

var uncommon_words = uncommon_arr.join(', ');
Completely untested, but the point is that you split both strings and check each word of the sentence against each member of the common-word list. It's naive and doesn't scale, but it would be fine for small examples such as this.
The String#diff function returns a list of differences (uncommon terms). The terms can be provided as an array or a string.
You call it like: sentence.diff(terms). Below is a unit test:
// Smoke test for String#diff: the uncommon words of the sample sentence,
// joined by toString(), must equal the expected comma-separated list.
var sentence = 'The dog ran to the other side of the field.';
var terms = 'the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of';
// NOTE: "terms" may be given as an array instead of a string.
if (sentence.diff(terms).toString() === 'dog,ran,other,side,field') {
  console.log('pass');
} else {
  console.log('fail');
}
Below is the 'String.diff' function definition:
/**
 * Returns the words of this string that do not appear in `terms`.
 *
 * Comparison is case-insensitive; the returned words keep their original
 * casing and order.
 *
 * @param {string|string[]} terms - Terms to exclude, either as an array or as
 *     a comma-separated string. Whitespace around each term is ignored and
 *     non-string array entries are skipped.
 * @returns {string[]} The words not listed in `terms`. An empty array when
 *     `terms` is falsy or is neither a string nor an array.
 */
String.prototype.diff = function (terms) {
  if (!terms) {
    return [];
  }

  var termList = typeof terms === 'string' ? terms.split(',') : terms;
  if (!Array.isArray(termList)) {
    return [];
  }

  // Null-prototype lookup: constant-time membership test without inherited
  // keys such as "constructor" being treated as excluded terms.
  var excluded = Object.create(null);
  termList.forEach(function (term) {
    // Skip non-string entries instead of throwing on term.toLowerCase().
    if (typeof term === 'string') {
      excluded[term.trim().toLowerCase()] = true;
    }
  });

  return String(this).split(/\W+/).filter(function (word) {
    // split() yields empty strings at leading/trailing punctuation.
    return word.length > 0 && !excluded[word.toLowerCase()];
  });
};
Here you go:
/**
 * Returns the words of a sentence that are not in a list of common words.
 *
 * @param {string} sentence - Text to filter; words are runs of word characters.
 * @param {string} common - Comma-separated common words. Whitespace around
 *     each entry is ignored and matching is case-insensitive.
 * @returns {string[]} Lower-cased uncommon words, in their original order.
 *     Empty when the sentence contains no words.
 */
function getUncommon(sentence, common) {
  var commonWords = common.split(',');
  // Null-prototype map so inherited keys such as "constructor" are not
  // mistaken for common words.
  var commonLookup = Object.create(null);
  var uncommonWords = [];
  var word;
  var i;

  for (i = 0; i < commonWords.length; i++) {
    // Lower-case the entries too: the sentence words are lower-cased before
    // lookup, so an entry like "The" would otherwise never match.
    commonLookup[commonWords[i].trim().toLowerCase()] = true;
  }

  // match() returns null (not an empty array) when nothing matches.
  var tokens = sentence.match(/\w+/g) || [];

  for (i = 0; i < tokens.length; i++) {
    word = tokens[i].toLowerCase();
    if (!commonLookup[word]) {
      uncommonWords.push(word);
    }
  }
  return uncommonWords;
}
Live demo: http://jsfiddle.net/simevidas/knXkS/
How about this?
// Removes every listed common word or phrase from `sentence` in one pass:
// \b anchors the alternatives to word boundaries, the `i` flag ignores case
// and the `g` flag replaces all occurrences.
// NOTE(review): the trailing "..." looks like a placeholder for the rest of
// the word list; as written it is regex syntax (three any-character
// wildcards after "we"), so replace it with the remaining words before use.
// replace() returns a new string and this statement discards it; assign the
// result to keep the stripped sentence.
sentence.replace(/\b(?:the|it is|we all|an?|by|to|you|[mh]e|she|they|we...)\b/ig, '');
This should remove all common words from your sentence. Just split the remaining string the way you want.