Merge pull request #9 from axa-group/feature/deutsch_senticon

Feature/deutsch senticon
axa-group · Aug 26, 2018 · 5cf86bb · 5cf86bb
2 parents 2ceafb5 + 0a0f0db
commit 5cf86bb
Show file tree

Hide file tree

Showing 9 changed files with 155,496 additions and 20 deletions.
diff --git a/lib/nlp/nlp-util.js b/lib/nlp/nlp-util.js
@@ -35,6 +35,7 @@ const HungarianStemmer = require('./stemmers/hungarian-stemmer');
 const ItalianStemmer = require('./stemmers/italian-stemmer');
 const NorwegianStemmer = require('./stemmers/norwegian-stemmer');
 const PortugueseStemmer = require('./stemmers/portuguese-stemmer');
+const PunctTokenizer = require('./tokenizers/punct-tokenizer');
 const RomanianStemmer = require('./stemmers/romanian-stemmer');
 const RussianStemmer = require('./stemmers/russian-stemmer');
 const SpanishStemmer = require('./stemmers/spanish-stemmer');
@@ -130,15 +131,15 @@ class NlpUtil {
       case 'id': return new AggressiveTokenizerId(); // Indonesian
       case 'ja': return new Natural.TokenizerJa(); // Japanese
 
-      case 'da': return new Natural.TreebankWordTokenizer(); // Danish
-      case 'fi': return new Natural.TreebankWordTokenizer(); // Finnish
-      case 'de': return new Natural.TreebankWordTokenizer(); // German
-      case 'hu': return new Natural.TreebankWordTokenizer(); // Hungarian
-      case 'ro': return new Natural.TreebankWordTokenizer(); // Romanian
-      case 'tr': return new Natural.TreebankWordTokenizer(); // Turkish
+      case 'da': return new PunctTokenizer(); // Danish
+      case 'fi': return new PunctTokenizer(); // Finnish
+      case 'de': return new PunctTokenizer(); // German
+      case 'hu': return new PunctTokenizer(); // Hungarian
+      case 'ro': return new PunctTokenizer(); // Romanian
+      case 'tr': return new PunctTokenizer(); // Turkish
       case 'zh': return new ChineseTokenizer(); // Chinese
 
-      default: return new Natural.TreebankWordTokenizer();
+      default: return new PunctTokenizer();
     }
   }
 

diff --git a/lib/nlp/tokenizers/punct-tokenizer.js b/lib/nlp/tokenizers/punct-tokenizer.js
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) AXA Shared Services Spain S.A.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Tokenizer that splits text on whitespace and common punctuation.
+ */
+class PunctTokenizer {
+  /**
+   * @param {Object} [settings] Optional settings; stored on the instance as-is.
+   */
+  constructor(settings) {
+    this.settings = settings || {};
+  }
+
+  /**
+   * Splits a text into tokens, discarding the separators.
+   * Separators are runs of whitespace or of any of the characters
+   * , . ! ? ; : ( ) [ ] ' " ¡ ¿
+   * @param {string} text Text to tokenize.
+   * @returns {string[]} The non-empty tokens, in their original order.
+   */
+  tokenize(text) {
+    // ")" is listed alongside "(" so that a closing parenthesis does not
+    // stay glued to the word that precedes it.
+    return text.split(/[\s,.!?;:()[\]'"¡¿]+/).filter(x => x !== '');
+  }
+}
+
+module.exports = PunctTokenizer;
diff --git a/lib/sentiment/languages/de/negations_de.json b/lib/sentiment/languages/de/negations_de.json
@@ -0,0 +1,3 @@
+{
+  "words": ["nicht", "garnicht", "kein", "keine", "keines", "keinem", "keiner", "keinesfalls", "keineswegs"]
+}