Commit 80a7fa8
Update for iarg.
zzsfornlp committed Apr 15, 2020 · 1 parent 379020c
Showing 60 changed files with 9,664 additions and 149 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -12,6 +12,10 @@ How to configure, generally: [here](docs/conf.md)

Related works:

"A Two-Step Approach for Implicit Event Argument Detection": [todo]()

Some other parsers for interested readers: [todo]()

"An Empirical Investigation of Structured Output Modeling for Graph-based Neural Dependency Parsing": [details](docs/emp_graph.md)

"On Difficulties of Cross-Lingual Transfer with Order Differences: A Case Study on Dependency Parsing": [details](docs/cl0.md)
19 changes: 15 additions & 4 deletions msp/__init__.py
@@ -1,17 +1,18 @@
#

# The Mingled Structured Prediction (v0plus) package
# by zzs (from 2018.02 - now)
# author: zzs
# time: 2018.02 - now

# dependencies: pytorch, numpy, scipy, gensim, cython, pybind11
# conda install pytorch numpy scipy gensim cython pybind11
# dependencies: pytorch, numpy, scipy, gensim, cython, pybind11, pandas
# conda install pytorch numpy scipy gensim cython pybind11 pandas

VERSION_MAJOR = 0
VERSION_MINOR = 1
VERSION_PATCH = 1
VERSION_STATUS = "dev"

# TODO(!)
# specific todos
# nn optimizer / param groups?
# check nn module (for simplification?)
# new model/training/testing scheme -> make it more modularized
@@ -20,6 +21,16 @@
# easy-to-use calculations result-representing tools for analysis
# various tools for python as the replacement of direct bash shell
# gru and cnn have problems?
# ----
# -- Next Version principles and goals:
# nlp data types
# use type hint
# checking and reporting
# use eval for Conf
# io and serialization
# summarize more common patterns, including those in scripts
# everything has (more flexible) conf
# more flexible save/load for parts of the model; (better naming, and support for dynamically adding and deleting components!!)

def version():
return (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, VERSION_STATUS)
13 changes: 9 additions & 4 deletions msp/data/streamer.py
@@ -116,14 +116,18 @@ class FileOrFdStreamer(Streamer):
def __init__(self, file_or_fd):
super().__init__()
self.file = file_or_fd
self.fd = None
self.input_is_fd = not isinstance(file_or_fd, str)
if self.input_is_fd:
self.fd = file_or_fd
else:
self.fd = None

def __del__(self):
if self.fd is not None:
if self.fd is not None and not self.input_is_fd:
self.fd.close()

def _restart(self):
if isinstance(self.file, str):
if not self.input_is_fd:
if self.fd is not None:
self.fd.close()
self.fd = zopen(self.file)
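The change above distinguishes streams the class opened itself from file descriptors handed in by the caller, so that cleanup only closes what the class owns. A minimal, self-contained sketch of that ownership rule (illustrative only, not the repo's exact class; plain open stands in for the repo's zopen helper):

```python
class FileOrFdSource:
    """Illustrative sketch: only close handles we opened ourselves."""
    def __init__(self, file_or_fd):
        self.input_is_fd = not isinstance(file_or_fd, str)
        self.file = file_or_fd
        # if the caller passed an open handle, reuse it but never close it
        self.fd = file_or_fd if self.input_is_fd else None

    def restart(self):
        if not self.input_is_fd:
            if self.fd is not None:
                self.fd.close()
            self.fd = open(self.file)   # the repo uses its own zopen helper here
        # for a caller-provided fd there is nothing to reopen

    def __del__(self):
        if self.fd is not None and not self.input_is_fd:
            self.fd.close()
```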
@@ -474,7 +478,8 @@ def _next(self):
while self.buffered_bsize_ < self.k:
one = self.base_streamer_.next()
if self.base_streamer_.is_eos(one):
break
# todo(+N): this actually does not ensure the end if base_streamer can re-produce things
break # should have an `active` check here; currently skipped, assuming it is the base_streamer's responsibility
# dump instances (like short or long instances)
dump_instance = any(f_(one) for f_ in self.dump_detectors)
if dump_instance:
29 changes: 19 additions & 10 deletions msp/data/vocab.py
@@ -3,12 +3,12 @@
from typing import Dict, Sequence

from msp.utils import zopen, zlog, zwarn, zcheck, StrHelper, FileHelper, Helper, JsonRW, PickleRW, printing, Random
from collections import Iterable, defaultdict
from collections import Iterable, defaultdict, OrderedDict
import numpy as np
import re

# for binary w2v loading
from gensim.models import KeyedVectors
# # for binary w2v loading
# from gensim.models import KeyedVectors

#

@@ -190,7 +190,7 @@ def filter_embed(self, wv: 'WordVectors', init_nohit=0., scale=1.0, assert_all_h
#
class VocabHelper:
# todo(0): I guess this will make them unique
SPECIAL_PATTERN = re.compile(r"\<z_([a-zA-Z]{3})_z\>")
SPECIAL_PATTERN = re.compile(r"\<z_([a-zA-Z]+)_z\>")

@staticmethod
def extract_name(w):
@@ -310,12 +310,15 @@ def filter_vals(word_vals, word_filter=(lambda ww, rank, val: True)):

# {word->vals} => {word->idx}, [filtered values]
@staticmethod
def ranking_vals(word_vals, pre_list, post_list, default_val, word_filter=(lambda ww, rank, val: True)):
ranked_list = Helper.rank_key(word_vals)
def ranking_vals(word_vals, pre_list, post_list, default_val, sort_vals, word_filter=(lambda ww, rank, val: True)):
if sort_vals:
valid_word_list = Helper.rank_key(word_vals)
else:
valid_word_list = word_vals.keys()
#
truncated_vals = [default_val] * len(pre_list)
v = dict(zip(pre_list, range(len(pre_list))))
for ii, ww in enumerate(ranked_list):
for ii, ww in enumerate(valid_word_list):
rank, val = ii+1, word_vals[ww]
if word_filter(ww, rank, val):
v[ww] = len(v)
@@ -337,9 +340,13 @@ def rf_filter(ww,rank,val): return val>=fthres and rank<=rthres
#
def finish(self, word_filter=(lambda ww, rank, val: True), sort_by_count=True, target_range=DEFAULT_TARGET_RANGE):
v = self.v
# sort by count-value, otherwise by adding order
tmp_vals = self.counts_ if sort_by_count else {k:-i for i,k in enumerate(self.keys_) if k in self.counts_}
v.v, v.final_vals = VocabBuilder.ranking_vals(tmp_vals, v.pre_list, v.post_list, self.default_val_, word_filter=word_filter)
if sort_by_count:
v.v, v.final_vals = VocabBuilder.ranking_vals(
self.counts_, v.pre_list, v.post_list, self.default_val_, True, word_filter=word_filter)
else:
tmp_counts_ = OrderedDict([(k, self.counts_[k]) for k in self.keys_])
v.v, v.final_vals = VocabBuilder.ranking_vals(
tmp_counts_, v.pre_list, v.post_list, self.default_val_, False, word_filter=word_filter)
v.final_words = Helper.reverse_idx(v.v)
printing("Build Vocab %s ok, from %d to %d, as %s." % (v.name, len(self.counts_), len(v), str(v)))
#
@@ -512,6 +519,8 @@ def _load_bin(fname):
printing("Going to load pre-trained (binary) w2v from %s ..." % fname)
one = WordVectors()
#
from gensim.models import KeyedVectors
#
kv = KeyedVectors.load_word2vec_format(fname, binary=True)
# KeyedVectors.save_word2vec_format()
one.num_words, one.embed_size = len(kv.vectors), len(kv.vectors[0])
58 changes: 46 additions & 12 deletions msp/nn/backends/bktr.py
@@ -11,8 +11,10 @@
from .common import COMMON_CONFIG, get_unique_name, _my_get_params_init

Expr = torch.Tensor
Module = torch.nn.Module
CPU_DEVICE = torch.device("cpu")
DEFAULT_DEVICE = CPU_DEVICE
T_INIT = torch.nn.init

# types
float32 = torch.float32
@@ -52,19 +54,28 @@ def is_expr(v):
is_tensor = is_expr

# parameter init from BK (similar to common.get_params_init)
# return a tensor here
def get_params_init(shape, init, lookup):
if COMMON_CONFIG.use_my_init:
return _my_get_params_init(shape, init, lookup)
# return a tensor here; (out_p4i means shape[0] actually stacks that many output pieces, so each piece can get a more reasonable init in some cases)
def get_params_init(shape, init, lookup, out_p4i, scale):
# if COMMON_CONFIG.use_my_init:
# return _my_get_params_init(shape, init, lookup)
assert not COMMON_CONFIG.use_my_init, "now use ones from pytorch for param init"
x = torch.empty(*shape, dtype=torch.float32, device=DEFAULT_DEVICE)
if len(shape) == 1:
nn.init.zeros_(x)
else:
if lookup:
scale = np.sqrt(3.0 / shape[-1])
nn.init.uniform_(x, -scale, scale)
_iscale = np.sqrt(3.0 / shape[-1])
nn.init.uniform_(x, -_iscale, _iscale)
# todo(+N): again back to previous init method
# nn.init.normal_(x)
x *= scale
elif init == "default" or init == "glorot":
nn.init.xavier_uniform_(x)
out_size = shape[0]
assert out_size % out_p4i == 0, "Bad output shape pieces for init value!"
s0 = out_size//out_p4i
for i in range(out_p4i):
nn.init.xavier_uniform_(x[i*s0:(i+1)*s0])
x *= scale
elif init == "ortho":
# todo(note): assume squared matrices
assert len(shape)==2 and (shape[0]%shape[1]==0 or shape[1]%shape[0]==0), "Invalid shape for ortho init"
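The new out_p4i argument lets a parameter whose first dimension stacks several output blocks (for example the gates of an LSTM weight) be initialized block by block, so each slice gets the fan-in/fan-out xavier would give it on its own. A minimal sketch of that idea in plain PyTorch (generic illustration, not the repo's get_params_init):

```python
import torch
import torch.nn as nn

def piecewise_xavier(shape, out_pieces=1, scale=1.0):
    # shape[0] is assumed to stack out_pieces output blocks of equal size
    x = torch.empty(*shape)
    assert shape[0] % out_pieces == 0
    block = shape[0] // out_pieces
    for i in range(out_pieces):
        nn.init.xavier_uniform_(x[i * block:(i + 1) * block])  # init each block on its own
    return x * scale

# e.g. an LSTM-style weight stacking 4 gates of size 100 over a 200-dim input:
w = piecewise_xavier((4 * 100, 200), out_pieces=4)
print(w.shape)  # torch.Size([400, 200])
```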
@@ -154,10 +165,20 @@ def nnc_name(self, name, check_stack=True):
def get_unique_name(self, name):
return get_unique_name(self.name_dict, name)

# add a torch.nn.Module's parameters
def param_add_external(self, name, mod: nn.Module):
ret_pairs = []
for one_subname, one_param in mod.named_parameters():
one_subname = "_".join(one_subname.split(".")) # cannot include "."
self.model_.register_parameter(name+"/"+one_subname, one_param)
ret_pairs.append((one_subname, one_param))
return ret_pairs

# register param
def param_new(self, name, shape, init_weights, lookup=False):
# almost all params are float
p = Parameter(torch.as_tensor(init_weights, dtype=torch.float32, device=DEFAULT_DEVICE))
assert name not in self.model_._parameters # no modules in this pc
self.model_.register_parameter(name, p)
return p
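param_add_external pulls the parameters of an external torch.nn.Module into this parameter collection; the sub-names are rewritten because nn.Module.register_parameter rejects names containing ".". A small illustration of that constraint and the rename, outside the repo's classes (holder and the module below are hypothetical stand-ins):

```python
import torch.nn as nn

holder = nn.Module()                                           # stands in for the collection's self.model_
external = nn.Sequential(nn.Linear(8, 16), nn.Linear(16, 4))   # has dotted names like "0.weight"

for sub_name, param in external.named_parameters():
    safe_name = "_".join(sub_name.split("."))    # register_parameter forbids '.' in names
    holder.register_parameter("ext/" + safe_name, param)

print([n for n, _ in holder.named_parameters()])
# ['ext/0_weight', 'ext/0_bias', 'ext/1_weight', 'ext/1_bias']
```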

@@ -178,9 +199,10 @@ def param_set_trainable(self, p, trainable):
def optimizer_set(self, optim_type, lrf_sv, oconf, params: List = None, check_repeat=True, check_full=False):
if params is None:
params = self.model_.parameters()
optim = Optim(optim_type, lrf_sv, oconf, params)
cur_optid = len(self.optims_)
self.optims_.append(optim)
if len(params) > 0:
optim = Optim(optim_type, lrf_sv, oconf, params)
cur_optid = len(self.optims_)
self.optims_.append(optim)
# track all params
for p in params:
paramid = id(p)
@@ -344,11 +366,14 @@ def select(t, idxes, dim=0):
elu = F.elu
exp = torch.exp
expand = lambda x, *args: x.expand(*args)
gelu = getattr(F, "gelu", None) # todo(warn): on older versions, this does not exist
log = torch.log
logsigmoid = F.logsigmoid
logsumexp = torch.logsumexp
max = torch.max # todo(warn): with dim, return tuple
max_elem = torch.max # todo(warn): max_elem(a, b)
min = torch.min
min_elem = torch.min
masked_select = torch.masked_select
matmul = torch.matmul
pad = F.pad
@@ -536,8 +561,9 @@ def conv(self, input_expr):
def mask2idx(mask_t, padding_idx=0):
mask_shape = get_shape(mask_t) # [*, L]
counts = mask_t.sum(-1).long() # [*]
max_count = counts.max().item() # int, the max expanding
padding_counts = max_count - counts # [*]
max_count_t = counts.max(-1, keepdim=True)[0]
max_count = max_count_t.item() # int, the max expanding
padding_counts = max_count_t - counts # [*]
max_padding_count = padding_counts.max().item() # int, the max count of padding
pad_t = (arange_idx(max_padding_count) < padding_counts.unsqueeze(-1)).float() # [*, max_pad]
concat_t = concat([mask_t, pad_t], -1) # [*, L+max_pad]
@@ -548,3 +574,11 @@ def mask2idx(mask_t, padding_idx=0):
valid_mask = (ret_idxes < slen).float()
ret_idxes[ret_idxes >= slen] = padding_idx
return ret_idxes, valid_mask
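mask2idx turns a 0/1 mask into the indices of its active positions, right-padded (with padding_idx) to the largest count in the batch, plus a mask over those indices. Since part of its body is elided above, here is a small, loop-based re-implementation of that contract for illustration only (the repo's version is vectorized):

```python
import torch

def mask_to_indices(mask, padding_idx=0):
    # mask: [batch, L] of 0/1 floats; returns indices of 1-positions, right-padded
    # to the max count in the batch, plus a validity mask over those slots.
    counts = mask.sum(-1).long()                      # [batch]
    max_count = int(counts.max().item())
    idxes = torch.full(mask.shape[:-1] + (max_count,), padding_idx, dtype=torch.long)
    valid = torch.zeros(mask.shape[:-1] + (max_count,))
    for b in range(mask.shape[0]):
        pos = mask[b].nonzero(as_tuple=True)[0]
        idxes[b, :len(pos)] = pos
        valid[b, :len(pos)] = 1.
    return idxes, valid

m = torch.tensor([[1., 1., 0., 1.],
                  [0., 1., 0., 0.]])
print(mask_to_indices(m))
# indices [[0, 1, 3], [1, 0, 0]] with validity [[1, 1, 1], [1, 0, 0]]
```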

# maxpool 1d at last dim
def max_pool1d(input, kernel):
orig_shape = get_shape(input)
# make it 3d
tmp_res = F.max_pool1d(input.view([-1]+orig_shape[-2:]), kernel)
real_res = tmp_res.view(orig_shape[:-1] + [-1])
return real_res
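A quick usage sketch for the helper above, assuming max_pool1d is in scope (it reshapes the input to 3-D for F.max_pool1d and restores the leading dims afterwards):

```python
import torch

x = torch.randn(2, 3, 8)   # e.g. [batch, seq, feat]
y = max_pool1d(x, 8)       # kernel spans the whole last dim -> max over features
print(y.shape)             # torch.Size([2, 3, 1])
```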
8 changes: 4 additions & 4 deletions msp/nn/layers/__init__.py
@@ -2,12 +2,12 @@

from .basic import BasicNode, RefreshOptions, ActivationHelper, Dropout, DropoutLastN
from .basic import NoDropRop, NoFixRop, FreezeRop
from .ff import Affine, LayerNorm, MatrixNode, Embedding, PosiEmbedding, RelPosiEmbedding
from .ff import Affine, LayerNorm, MatrixNode, Embedding, PosiEmbedding, RelPosiEmbedding, PosiEmbedding2
from .multi import Sequential, Summer, Concater, Joiner, \
NodeWrapper, AddNormWrapper, AddActWrapper, HighWayWrapper, get_mlp
from .enc import RnnNode, GruNode, LstmNode, RnnLayer, RnnLayerBatchFirstWrapper, CnnNode, CnnLayer, \
NodeWrapper, AddNormWrapper, AddActWrapper, HighWayWrapper, get_mlp, get_mlp2
from .enc import RnnNode, GruNode, LstmNode, LstmNode2, RnnLayer, RnnLayerBatchFirstWrapper, CnnNode, CnnLayer, \
TransformerEncoderLayer, TransformerEncoder, Transformer2EncoderLayer, Transformer2Encoder
from .att import AttentionNode, FfAttentionNode, MultiHeadAttention, \
MultiHeadRelationalAttention, MultiHeadSelfDistAttention, AttConf, AttDistHelper
from .dec import *
from .biaffine import BiAffineScorer
from .biaffine import BiAffineScorer, PairScorerConf, PairScorer
30 changes: 23 additions & 7 deletions msp/nn/layers/basic.py
@@ -2,6 +2,7 @@

from ..backends import BK
from ..backends.common import get_unique_name
import numpy as np

from msp.utils import extract_stack

@@ -40,7 +41,7 @@ def NoFixRop(): return RefreshOptions(fix_drop=False, fix_set=("fix_drop", ))

# helpers
class ActivationHelper(object):
ACTIVATIONS = {"tanh": BK.tanh, "softmax": BK.softmax, "relu": BK.relu, "elu": BK.elu,
ACTIVATIONS = {"tanh": BK.tanh, "softmax": BK.softmax, "relu": BK.relu, "elu": BK.elu, "gelu": BK.gelu,
"sigmoid": BK.sigmoid, "linear": lambda x:x}
# reduction for seq after conv
POOLINGS = {"max": lambda x: BK.max(x, -2)[0], "avg": lambda x: BK.avg(x, -2)}
@@ -109,11 +110,12 @@ def get_output_dims(self, *input_dims):
return input_dims

# create param from PC
def add_param(self, name, shape, init=None, lookup=False, check_stack=True):
def add_param(self, name, shape, init=None, lookup=False, check_stack=True, out_p4i=1, scale=1.):
if init is None:
w = BK.get_params_init(shape, "default", lookup)
elif isinstance(init, str):
w = BK.get_params_init(shape, init, lookup)
init = "default"
# -----
if isinstance(init, str):
w = BK.get_params_init(shape, init, lookup, out_p4i, scale)
else:
w = init
name = self.get_unique_name(name)
@@ -142,9 +144,17 @@ def get_parameters(self, recursively=True):
ret.extend(node.get_parameters(recursively))
return ret

# count number of parameters
def count_allsize_parameters(self, recursively=True):
count = 0
list_params = self.get_parameters(recursively)
for p in list_params:
count += np.prod(BK.get_shape(p))
return int(count)

# commonly used Nodes
class Dropout(BasicNode):
def __init__(self, pc, shape, which_drop="hdrop", name=None, init_rop=None):
def __init__(self, pc, shape, which_drop="hdrop", name=None, init_rop=None, fix_rate=None):
super().__init__(pc, name, init_rop)
self.f_ = None
self.shape = shape
@@ -155,12 +165,18 @@ def __init__(self, pc, shape, which_drop="hdrop", name=None, init_rop=None):
if which_drop == "gdrop":
self.rop.fix_drop = True
self.rop.add_to_fix_set("fix_drop")
assert fix_rate is None
#
self.fix_rate = fix_rate

def refresh(self, rop=None):
super().refresh(rop)
#
r = self.rop
drop = self.drop_getter_(r)
if self.fix_rate is not None:
drop = self.fix_rate
else:
drop = self.drop_getter_(r)
# todo(+3): another overall switch, not quite elegant!
if not r.training:
self.f_ = lambda x: x