Skip to content

Commit

Permalink
Merge pull request #9 from axa-group/feature/deutsch_senticon
Browse files Browse the repository at this point in the history
Feature/deutsch senticon
  • Loading branch information
Jesús Seijas authored Aug 26, 2018
2 parents 2ceafb5 + 0a0f0db commit 5cf86bb
Show file tree
Hide file tree
Showing 9 changed files with 155,496 additions and 20 deletions.
15 changes: 8 additions & 7 deletions lib/nlp/nlp-util.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const HungarianStemmer = require('./stemmers/hungarian-stemmer');
const ItalianStemmer = require('./stemmers/italian-stemmer');
const NorwegianStemmer = require('./stemmers/norwegian-stemmer');
const PortugueseStemmer = require('./stemmers/portuguese-stemmer');
const PunctTokenizer = require('./tokenizers/punct-tokenizer');
const RomanianStemmer = require('./stemmers/romanian-stemmer');
const RussianStemmer = require('./stemmers/russian-stemmer');
const SpanishStemmer = require('./stemmers/spanish-stemmer');
Expand Down Expand Up @@ -130,15 +131,15 @@ class NlpUtil {
case 'id': return new AggressiveTokenizerId(); // Indonesian
case 'ja': return new Natural.TokenizerJa(); // Japanese

case 'da': return new Natural.TreebankWordTokenizer(); // Danish
case 'fi': return new Natural.TreebankWordTokenizer(); // Finnish
case 'de': return new Natural.TreebankWordTokenizer(); // German
case 'hu': return new Natural.TreebankWordTokenizer(); // Hungarian
case 'ro': return new Natural.TreebankWordTokenizer(); // Romanian
case 'tr': return new Natural.TreebankWordTokenizer(); // Turkish
case 'da': return new PunctTokenizer(); // Danish
case 'fi': return new PunctTokenizer(); // Finnish
case 'de': return new PunctTokenizer(); // German
case 'hu': return new PunctTokenizer(); // Hungarian
case 'ro': return new PunctTokenizer(); // Romanian
case 'tr': return new PunctTokenizer(); // Turkish
case 'zh': return new ChineseTokenizer(); // Chinese

default: return new Natural.TreebankWordTokenizer();
default: return new PunctTokenizer();
}
}

Expand Down
34 changes: 34 additions & 0 deletions lib/nlp/tokenizers/punct-tokenizer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) AXA Shared Services Spain S.A.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

/**
 * Tokenizer that splits text on whitespace and common punctuation marks.
 */
class PunctTokenizer {
  /**
   * @param {Object} [settings] Optional settings object; stored as-is on the
   *   instance. Any falsy value is replaced by an empty object.
   */
  constructor(settings) {
    this.settings = settings || {};
  }

  /**
   * Splits the given text into tokens, discarding the delimiters.
   *
   * Delimiters are runs of whitespace and of the characters
   * , . ! ? ; : ( ) [ ] ' " ¡ ¿
   *
   * @param {string} text Text to tokenize.
   * @returns {string[]} The non-empty tokens in order of appearance.
   */
  tokenize(text) {
    // The closing parenthesis is part of the delimiter set so that
    // "(word)" yields "word" rather than "word)", mirroring the handling
    // of the opening parenthesis and of both square brackets.
    // Leading/trailing delimiters make split() emit empty strings, which
    // the filter removes.
    return text.split(/[\s,.!?;:()[\]'"¡¿]+/).filter(token => token !== '');
  }
}

module.exports = PunctTokenizer;
3 changes: 3 additions & 0 deletions lib/sentiment/languages/de/negations_de.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"words": ["nicht", "garnicht", "kein", "keine", "keines", "keinem", "keiner", "keinesfalls", "keineswegs"]
}
Loading

0 comments on commit 5cf86bb

Please sign in to comment.