From 9c3a41d5545fed26c081b33f37be5fec2eb2385d Mon Sep 17 00:00:00 2001
From: Zifei Tong <zifeitong@gmail.com>
Date: Wed, 27 May 2015 14:53:22 +0800
Subject: [PATCH] Add utf8 character support

---
 util/CharSplitLMMinibatchLoader.lua | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/util/CharSplitLMMinibatchLoader.lua b/util/CharSplitLMMinibatchLoader.lua
index 1fafe398..b428a57f 100644
--- a/util/CharSplitLMMinibatchLoader.lua
+++ b/util/CharSplitLMMinibatchLoader.lua
@@ -95,8 +95,11 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o
     print('creating vocabulary mapping...')
     -- record all characters to a set
     local unordered = {}
-    for char in rawdata:gmatch'.' do
+    local len = 0
+    -- iterate over whole UTF-8 characters instead of single bytes; pattern taken from http://lua-users.org/wiki/LuaUnicode
+    for char in string.gfind(rawdata, "([%z\1-\127\194-\244][\128-\191]*)") do
         if not unordered[char] then unordered[char] = true end
+        len = len + 1
     end
     -- sort into a table (i.e. keys become 1..N)
     local ordered = {}
@@ -109,9 +112,11 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o
     end
     -- construct a tensor with all the data
     print('putting data into tensor...')
-    local data = torch.ByteTensor(#rawdata) -- store it into 1D first, then rearrange
-    for i=1, #rawdata do
-        data[i] = vocab_mapping[rawdata:sub(i, i)] -- lua has no string indexing using []
+    local data = torch.ShortTensor(len) -- store it into 1D first, then rearrange
+    local pos = 1
+    for char in string.gfind(rawdata, "([%z\1-\127\194-\244][\128-\191]*)") do
+        data[pos] = vocab_mapping[char]
+        pos = pos + 1
     end
 
     -- save output preprocessed files