Skip to content

Commit

Permalink
Add some other parsers.
Browse files Browse the repository at this point in the history
  • Loading branch information
zzsfornlp committed Jan 19, 2020
1 parent ed43cc5 commit 5f7d454
Show file tree
Hide file tree
Showing 123 changed files with 16,726 additions and 915 deletions.
28 changes: 19 additions & 9 deletions msp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
#

# The Mingle Structured Prediction (v0plus) package
# The Mingled Structured Prediction (v0plus) package
# by zzs (from 2018.02 - now)

# dependencies: pytorch, numpy, scipy, gensim, cython
# dependencies: pytorch, numpy, scipy, gensim, cython, pybind11
# conda install pytorch numpy scipy gensim cython pybind11

VERSION_MAJOR = 0
VERSION_MINOR = 0
VERSION_MINOR = 1
VERSION_PATCH = 1

VERSION_STATUS = "dev"

# TODO(!)
# nn optimizer / param groups?
# check nn module (for simplification?)
# new model/training/testing scheme -> make it more modularized
# Conf's init: what types
# dropout setting: use training/testing(with/wo)-mode
# easy-to-use calculations result-representing tools for analysis
# various tools for python as the replacement of direct bash shell
# gru and cnn have problems?

def version():
    """Return the package version as a (major, minor, patch, status) tuple.

    The status component (e.g. "dev") is included so that the derived
    ``__version__`` string reads like ``"0.1.1.dev"``.
    """
    # fix: a stale duplicate `return` of the old 3-tuple made the 4-tuple
    # (with VERSION_STATUS) unreachable; keep only the intended one.
    return (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, VERSION_STATUS)

__version__ = ".".join(str(z) for z in version())

Expand All @@ -26,9 +36,9 @@ def version():
# !!: (again-corrected) The goal is not to build a whole framework, but several useful pieces.

# conventions
# todo(0)/todo(warn): simply warning
# todo(+N): need what level of efforts
# TODO(!): unfinished, real todo
# todo(0)/todo(warn)/todo(note): simply warning or noting
# todo(+N): need what level of efforts, +N means lots of efforts
# TODO: unfinished, real todo

# hierarchically: msp -> scripts / tasks, no reverse ref allowed!

Expand Down
2 changes: 2 additions & 0 deletions msp/cmp/zarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import numpy as np

# todo(warn): should use pandas!

class ZArray:
def __init__(self, arr: np.ndarray, names=None):
self.arr = arr
Expand Down
34 changes: 34 additions & 0 deletions msp/cmp/ztab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#

#
#
class ZTab:
    """Simple separated-value table (de)serializer.

    Layout convention: the first line holds the column names, the first field
    of every following line holds the row name, and the top-left corner cell
    is a blank placeholder.
    """

    @staticmethod
    def read(ss, fsep='\t', lsep='\n'):
        """Parse table text into ``(row_names, col_names, {row: {col: value}})``.

        ss: the full table text; fsep/lsep: field and line separators.
        Raises AssertionError on duplicated row names or ragged rows.
        """
        pieces = [z.split(fsep) for z in ss.split(lsep)]
        # first header cell is the blank corner, hence the -1
        num_col = len(pieces[0]) - 1
        row_names = [z[0] for z in pieces[1:]]
        col_names = pieces[0][1:]
        ret = {}
        # add entries
        for one_row in pieces[1:]:
            k = one_row[0]
            assert k not in ret, f"Duplicated row name: {k}"
            cur_m = {}
            assert len(one_row) - 1 == num_col, f"Ragged row (wrong field count): {k}"
            for n, v in zip(col_names, one_row[1:]):
                cur_m[n] = v
            ret[k] = cur_m
        return row_names, col_names, ret

    @staticmethod
    def write(row_names, col_names, value_f, fsep='\t', lsep='\n'):
        """Render a table string; ``value_f(row_name, col_name)`` supplies each cell."""
        pieces = []
        # add head line: blank corner + column names
        pieces.append([""] + col_names)
        # add one line per row
        for r in row_names:
            pieces.append([r] + [value_f(r, c) for c in col_names])
        # finally concat
        ss = lsep.join([fsep.join(z) for z in pieces])
        return ss
2 changes: 1 addition & 1 deletion msp/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from .data import Instance, TextReader, FdReader, WordNormer
from .vocab import Vocab, VocabHelper, VocabBuilder, WordVectors, VocabPackage, MultiHelper
from .streamer import Streamer, AdapterStreamer, FAdapterStreamer, FileOrFdStreamer, BatchArranger, InstCacher, \
MultiCatStreamer, IterStreamer, MultiZipStreamer
MultiCatStreamer, IterStreamer, MultiZipStreamer, FListAdapterStream, MultiJoinStreamer, ShuffleStreamer
94 changes: 83 additions & 11 deletions msp/data/streamer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#

from msp.utils import zopen, zcheck, zfatal, ZException
from msp.utils import zopen, zcheck, zfatal
from msp.utils import Constants, Random
from typing import Iterable, Sequence

Expand Down Expand Up @@ -147,10 +147,25 @@ def _next(self):
else:
return z

# f: Inst -> List[Inst] (augment or filter)
class FListAdapterStream(AdapterStreamer):
    """Adapter that maps every upstream instance through `f`, which may expand
    it into several instances (augmentation) or into none (filtering)."""

    def __init__(self, base_streamer, f):
        super().__init__(base_streamer)
        self.f = f
        self.cache = []       # pending outputs of f for the current upstream instance
        self.cache_idx = 0    # next position in self.cache to yield

    def _next(self):
        # refill until the cache has an unread item; f may return [] (filtered out),
        # in which case we keep pulling from upstream
        while not (self.cache_idx < len(self.cache)):
            one = self.base_streamer_.next()
            if self.base_streamer_.is_eos(one):
                return None  # upstream exhausted -> signal EOS
            self.cache.clear()
            self.cache.extend(self.f(one))
            self.cache_idx = 0
        idx = self.cache_idx
        self.cache_idx = idx + 1
        return self.cache[idx]

# =====
# Random Streams
Expand Down Expand Up @@ -202,13 +217,6 @@ def _next(self):
if self.rs_.next():
return one

# TODO(+2)
class ShuffleStreamer(AdapterStreamer):
# -1 means read all and shuffle
def __init__(self, base_streamer, batch_size=-1):
super().__init__(base_streamer)
raise NotImplementedError()

# =====
# Multi Streamers

Expand Down Expand Up @@ -247,6 +255,45 @@ def _restart(self):
one.restart()
self._mc_set_cur(0)

#
# like MultiCat, but join the streams (horizontally) rather than concat (vertically)
# todo(note): currently mixing them 1 by 1; maybe not efficient enough if there are too many streamers
class MultiJoinStreamer(MultiStreamer):
    """Round-robin interleaving of several base streamers.

    Yields one instance from each still-live streamer in turn; a streamer that
    reaches EOS is flagged in ``self.ended`` and skipped afterwards.  The joined
    stream signals EOS (returns None) only once every base streamer has ended.
    """

    def __init__(self, base_streamers: Sequence):
        super().__init__(base_streamers)
        # index of the streamer to try first on the next _next() call
        self.cur_pointer = 0
        # per-streamer EOS flags, parallel to self.base_streamers_
        self.ended = [False] * len(self.base_streamers_)

    def _next(self):
        # find the next active streamer
        starting_point = self.cur_pointer
        num_streamers = self.num_streamers_
        cur_point = starting_point
        flag_success = False
        one = None
        for i in range(num_streamers):  # at most travel one round
            if not self.ended[cur_point]:
                new_streamer = self.base_streamers_[cur_point]
                one = new_streamer.next()
                if new_streamer.is_eos(one):
                    # mark this streamer exhausted and keep scanning the rest
                    self.ended[cur_point] = True
                else:
                    flag_success = True
            # advance BEFORE possibly breaking, so the next call starts at the
            # streamer after the one just consumed (true round-robin)
            cur_point = (cur_point + 1) % num_streamers
            if flag_success:
                break
        self.cur_pointer = cur_point
        if flag_success:
            return one
        else:
            # a full round found nothing alive -> overall EOS
            return None

    def _restart(self):
        # restart every base streamer and reset round-robin state
        for one in self.base_streamers_:
            one.restart()
        self.cur_pointer = 0
        self.ended = [False] * len(self.base_streamers_)

#
# zip-like multi-streamer, with various modes, return a list of instances
# -- can be useful to combine with LoopStreamer for wrapping-around
Expand Down Expand Up @@ -301,6 +348,9 @@ def get(self):
def reset(self):
raise NotImplementedError()

    def clear(self):
        # drop all cached items and reset read state (abstract; implemented by subclasses)
        raise NotImplementedError()

def __len__(self):
raise NotImplementedError()

Expand All @@ -325,6 +375,10 @@ def get(self):
self.ptr += 1
return self.c[self.ptr-1]

    def clear(self):
        # forget all stored items and rewind the read pointer
        self.c.clear()
        self.ptr = 0

def reset(self):
self.ptr = 0
if self.shuffle:
Expand All @@ -347,6 +401,24 @@ def _restart(self):
self.cache.put(one)
self.cache.reset()

# todo(+1): partial shuffle to avoid read all?
class ShuffleStreamer(AdapterStreamer):
    """Reads the entire source stream into a shuffling cache on each (re)start
    and then serves the instances back in shuffled order."""

    # -1 means read all and shuffle
    def __init__(self, src_stream, cache_builder=InplacedCache):
        super().__init__(src_stream)
        self.src = src_stream
        # the cache both stores the instances and shuffles them on reset()
        self.cache = cache_builder(shuffle=True)

    def _next(self):
        # all the real work happens in _restart; here we only drain the cache
        return self.cache.get()

    def _restart(self):
        # todo(note): rebuild cache each time; iterating self.src triggers the
        # source's own restart, so there is no need to call super() here
        self.cache.clear()
        for inst in self.src:
            self.cache.put(inst)
        self.cache.reset()

# =====
# BatchHelper

Expand Down
Loading

0 comments on commit 5f7d454

Please sign in to comment.