Commit

Updates for zfp.
zzsfornlp committed Jun 17, 2020
1 parent 565a6fb commit 842c8b7
Showing 24 changed files with 1,617 additions and 47 deletions.
5 changes: 4 additions & 1 deletion msp/__init__.py
@@ -23,14 +23,17 @@
# gru and cnn have problems?
# ----
# -- Next Version principles and goals:
# nlp data types
# nlp data types & unified data formats!!
# model and submodules and ... (composition or inheritance??) !!
# more flexible namings: model/module name, loss name, data-field names, ...
# use type hint
# checkings and reportings
# use eval for Conf
# io and serialization
# summarize more common patterns, including those in scripts
# everything has (more flexible) conf
# more flexible save/load for part of model; (better naming and support dynamic adding and deleting components!!)
# a small one: more flexible path finder, for example, multiple upper layers of ".."

def version():
return (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, VERSION_STATUS)
4 changes: 3 additions & 1 deletion msp/nn/backends/bktr.py
@@ -118,7 +118,9 @@ def update(self, overall_lrate, grad_factor):
for param_group in self.opt_.param_groups:
param_group['lr'] = cur_lrate
self.cached_lrate_ = cur_lrate
if cur_lrate<=0. and self.no_step_lrate0_:
# check if we need update
parameters = list(filter(lambda p: p.grad is not None, self.params_))
if (cur_lrate<=0. and self.no_step_lrate0_) or (len(parameters) == 0):
# no update
self.opt_.zero_grad()
else:
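
For reference, a minimal standalone sketch of the guard introduced here: skip the optimizer step when the learning rate is zero, or when no parameter actually received a gradient. The helper name and setup below are illustrative, not the repository's API:

import torch

def guarded_step(optimizer, params, cur_lrate, no_step_lrate0=True):
    # set the current learning rate on every param group
    for param_group in optimizer.param_groups:
        param_group['lr'] = cur_lrate
    # keep only parameters that actually received gradients in this backward pass
    params_with_grad = [p for p in params if p.grad is not None]
    if (cur_lrate <= 0. and no_step_lrate0) or len(params_with_grad) == 0:
        # no update: just clear any stale gradients
        optimizer.zero_grad()
    else:
        optimizer.step()
        optimizer.zero_grad()

# illustrative usage: a single linear layer with SGD
layer = torch.nn.Linear(4, 2)
opt = torch.optim.SGD(layer.parameters(), lr=0.1)
layer(torch.randn(3, 4)).sum().backward()
guarded_step(opt, list(layer.parameters()), cur_lrate=0.1)
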
2 changes: 1 addition & 1 deletion msp/utils/color.py
@@ -3,7 +3,7 @@
# colorful printing

try:
from colorama import colorama_init
from colorama import init as colorama_init
colorama_init()
from colorama import Fore, Back, Style
RESET_ALL = Style.RESET_ALL
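
The original import failed because colorama exposes its entry point as init, not colorama_init, so the surrounding try/except silently fell back to uncolored output; the fix aliases the real function on import. A minimal usage sketch (assumes colorama is installed):

from colorama import init as colorama_init, Fore, Style

colorama_init()  # enable ANSI color handling, including on Windows consoles
print(Fore.RED + "error: something went wrong" + Style.RESET_ALL)
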
7 changes: 6 additions & 1 deletion msp/zext/evaler.py
@@ -35,12 +35,13 @@ def __float__(self):
return self.f1

class LabelF1Evaler:
def __init__(self, name):
def __init__(self, name, ignore_none=False):
# key -> List[labels]
self.name = name
self.golds = {}
self.preds = {}
self.labels = set()
self.ignore_none = ignore_none

# =====
# adding ones
@@ -53,9 +54,13 @@ def _add_group(self, d: Dict, key, label):
self.labels.add(label)

def add_gold(self, key, label):
if key is None and self.ignore_none:
return
self._add_group(self.golds, key, label)

def add_pred(self, key, label):
if key is None and self.ignore_none:
return
self._add_group(self.preds, key, label)

# =====
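
A hypothetical usage sketch of the new ignore_none flag (LabelF1Evaler and its add_gold/add_pred methods are the ones shown above; the item lists are made up for illustration):

from msp.zext.evaler import LabelF1Evaler

evaler = LabelF1Evaler("labels", ignore_none=True)
gold_items = [("tok1", "nsubj"), (None, "obj")]   # (key, label) pairs; None marks an unanchored item
pred_items = [("tok1", "nsubj"), (None, "amod")]
for key, label in gold_items:
    evaler.add_gold(key, label)   # entries with key=None are now silently skipped
for key, label in pred_items:
    evaler.add_pred(key, label)
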
11 changes: 10 additions & 1 deletion tasks/zdpar/common/confs.py
@@ -47,7 +47,9 @@ def __init__(self):
# special processing
self.lower_case = False
self.norm_digit = False # norm digits to 0
self.use_label0 = False # using only first-level label
self.use_label0 = True # using only first-level label # todo(note): change the default behaviour
zwarn("Note: currently we change default value of 'use_label0' to True!")
self.vocab_add_prevalues = True # add pre-defined UDv2 values when building dicts
# =====
# for multi-lingual processing (another option is to pre-processing suitable data)
# language code (empty str for no effects)
@@ -89,6 +91,9 @@ def __init__(self, partype, args):
elif partype == "s2":
from ..ef.parser import S2ParserConf
self.pconf = S2ParserConf()
elif partype == "fp":
from ..zfp.fp import FpParserConf
self.pconf = FpParserConf()
else:
zfatal(f"Unknown parser type: {partype}, please provide correct type with the option.")
# =====
@@ -125,6 +130,10 @@ def build_model(partype, conf, vpack):
# two-stage parser
from ..ef.parser import S2Parser
parser = S2Parser(pconf, vpack)
elif partype == "fp":
# the finale parser
from ..zfp.fp import FpParser
parser = FpParser(pconf, vpack)
else:
zfatal("Unknown parser type: %s")
return parser
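
The new "fp" branch follows the same lazy-import dispatch as the existing parser types; reduced to a sketch (class and module names as in the diff, the function itself is illustrative):

def build_fp_parser(partype, pconf, vpack):
    if partype == "fp":
        # new parser type registered in this commit
        from tasks.zdpar.zfp.fp import FpParser
        return FpParser(pconf, vpack)
    raise ValueError(f"Unknown parser type: {partype}")
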
1 change: 1 addition & 0 deletions tasks/zdpar/common/model.py
@@ -283,6 +283,7 @@ def __init__(self):
self.load_process = False
# batch arranger
self.batch_size = 32
self.train_min_length = 0
self.train_skip_length = 120
self.shuffle_train = True
# optimizer and lrate factor for enc&dec&sl(mid)
4 changes: 2 additions & 2 deletions tasks/zdpar/common/run.py
@@ -54,8 +54,8 @@ def index_stream(in_stream, vpack, cached, cache_shuffle, inst_preparer):
def batch_stream(in_stream, ticonf, training):
if training:
b_stream = BatchArranger(in_stream, batch_size=ticonf.batch_size, maxibatch_size=20, batch_size_f=None,
dump_detectors=lambda one: len(one)>=ticonf.train_skip_length, single_detectors=None,
sorting_keyer=len, shuffling=ticonf.shuffle_train)
dump_detectors=lambda one: len(one)>=ticonf.train_skip_length or len(one)<ticonf.train_min_length,
single_detectors=None, sorting_keyer=len, shuffling=ticonf.shuffle_train)
else:
b_stream = BatchArranger(in_stream, batch_size=ticonf.batch_size, maxibatch_size=-1, batch_size_f=None,
dump_detectors=None, single_detectors=lambda one: len(one)>=ticonf.infer_single_length,
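
A sketch of the length filter the new option enables: training instances are dropped when they are too long (train_skip_length, as before) or, new in this commit, shorter than train_min_length. Everything below other than those two option names is illustrative:

from types import SimpleNamespace

def make_dump_detector(ticonf):
    def _dump(inst):
        n = len(inst)
        # drop instances that are too long OR too short for training
        return n >= ticonf.train_skip_length or n < ticonf.train_min_length
    return _dump

ticonf = SimpleNamespace(train_skip_length=120, train_min_length=5)  # min length 5 is just an example; the default is 0
dump = make_dump_detector(ticonf)
assert dump(list(range(130))) and dump([1, 2]) and not dump(list(range(30)))
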
19 changes: 17 additions & 2 deletions tasks/zdpar/common/vocab.py
@@ -22,6 +22,12 @@ def build_by_reading(dconf):
one.load(dconf.dict_dir)
return one

# =====
# pre-values for UDv2
PRE_VALUES_ULAB = ["punct", "case", "nsubj", "det", "root", "<z_r_z>", "nmod", "advmod", "obj", "obl", "amod", "compound", "aux", "conj", "mark", "cc", "cop", "advcl", "acl", "xcomp", "nummod", "ccomp", "appos", "flat", "parataxis", "discourse", "expl", "fixed", "list", "iobj", "csubj", "goeswith", "vocative", "reparandum", "orphan", "dep", "dislocated", "clf"]
PRE_VALUES_UPOS = ["NOUN", "PUNCT", "VERB", "PRON", "ADP", "DET", "PROPN", "<z_r_z>", "ADJ", "AUX", "ADV", "CCONJ", "PART", "NUM", "SCONJ", "X", "INTJ", "SYM"]
# =====

@staticmethod
def build_from_stream(dconf: DConf, stream, extra_stream):
zlog("Build vocabs from streams.")
@@ -32,13 +38,22 @@ def build_from_stream(dconf: DConf, stream, extra_stream):
pos_builder = VocabBuilder("pos")
label_builder = VocabBuilder("label")
word_normer = ret.word_normer
if dconf.vocab_add_prevalues:
zlog(f"Add pre-defined values for upos({len(ParserVocabPackage.PRE_VALUES_UPOS)}) and "
f"ulabel({len(ParserVocabPackage.PRE_VALUES_ULAB)}).")
pos_builder.feed_stream(ParserVocabPackage.PRE_VALUES_UPOS)
label_builder.feed_stream(ParserVocabPackage.PRE_VALUES_ULAB)
for inst in stream:
# todo(warn): only do special handling for words
# there must be words
word_builder.feed_stream(word_normer.norm_stream(inst.words.vals))
for w in inst.words.vals:
char_builder.feed_stream(w)
pos_builder.feed_stream(inst.poses.vals)
label_builder.feed_stream(inst.labels.vals)
# pos and label can be optional
if inst.poses.has_vals():
pos_builder.feed_stream(inst.poses.vals)
if inst.labels.has_vals():
label_builder.feed_stream(inst.labels.vals)
#
w2vec = None
if dconf.init_from_pretrain:
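
The intent of vocab_add_prevalues, in sketch form: make sure every UDv2 tag and dependency label ends up in the built vocabularies even if the training treebank never uses it, so decoding on other treebanks does not hit unknown labels. A toy stand-in for the builder (the real interface is the VocabBuilder/feed_stream one shown in the diff):

from collections import Counter

def build_toy_vocab(pre_values, corpus_values):
    counts = Counter()
    counts.update(pre_values)      # pre-defined UDv2 inventory, fed first
    counts.update(corpus_values)   # values actually observed in training data
    return set(counts)

vocab = build_toy_vocab(["nsubj", "obj", "expl"], ["nsubj", "obj", "obj"])
assert "expl" in vocab             # present even though the corpus never contained it
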
2 changes: 1 addition & 1 deletion tasks/zdpar/ef/analysis/ann.py
@@ -97,7 +97,7 @@ def do_print(self, ocode):
# -----
# looking/annotating at specific instances. protocol: target

# start new annotation task or TODO(!) recover the previous one
# start new annotation task or TODO(+N) recover the previous one
def do_ann_start(self, insts_target: str) -> AnnotationTask:
assert self.cur_cmd_target is not None, "Should assign this to a var to avoid accidental loss!"
vs = self.vars
177 changes: 177 additions & 0 deletions tasks/zdpar/ef/analysis/run1217.py
@@ -0,0 +1,177 @@
#

# case study for the ef one
# error breakdown on labels,steps; which ones are "easy"(first-decoded) ones

#

import sys
from collections import Counter
from msp.zext.ana import AnalyzerConf, Analyzer, ZRecNode, AnnotationTask

try:
from .ann import *
except:
from ann import *

#
class NewAnalysisConf(AnalysisConf):
def __init__(self, args):
super().__init__(args)
#
self.step_div = 5
self.use_label0 = True

def main(args):
conf = NewAnalysisConf(args)
# =====
if conf.load_name == "":
# recalculate them
# read them
zlog("Read them all ...")
gold_parses = list(yield_ones(conf.gold))
sys_parses = [list(yield_ones(z)) for z in conf.fs]
if conf.use_label0:
# todo(note): force using label0 (language-independent)
zlog("Force label0 ...")
for one_parses in sys_parses + [gold_parses]:
for one_parse in one_parses:
for one_token in one_parse.get_tokens():
one_token.label = one_token.label0
# use vocab?
voc = Vocab.read(conf.vocab) if len(conf.vocab)>0 else None
# =====
# stat them
zlog("Stat them all ...")
all_sents, all_tokens = get_objs(gold_parses, sys_parses, conf.getter)
analyzer = ParsingAnalyzer(conf.ana, all_sents, all_tokens, conf.labeled, vocab=voc)
analyzer.set_var("nsys", len(conf.fs), explanation="init", history_idx=-1)
if conf.save_name != "":
analyzer.do_save(conf.save_name)
else:
analyzer = ParsingAnalyzer(conf.ana, None, None, conf.labeled)
analyzer.do_load(conf.load_name)
# =====
# special analysis
# ----
def _num_same_sibs(_node):
# todo(note): here split label again
_lab = _node.label
if conf.use_label0:
_count = sum(z.split(":")[0]==_lab for z in _node.get_head().childs_labels)
else:
_count = sum(z==_lab for z in _node.get_head().childs_labels)
assert _count>=1
return _count-1
# ----
all_sents = analyzer.get_var("sents")
nsys = analyzer.get_var("nsys")
step_div = conf.step_div # how many bins for ef-steps?
breakdown_labels = {} # label -> {gold: {count, numsib, dist}, preds: [{count, numsib, dist, lcorr, stepp}]}
for _lab in ulabel2type["Nivre17"].keys():
breakdown_labels[_lab] = {"gold": {"count": 0, "numsib": 0, "dist": 0},
"preds": [{"count": 0, "numsib": 0, "dist": 0, "lcorr": 0, "stepp": 0} for _ in range(nsys)]}
breakdown_steps = {} # stepbin -> {count, dist, counter(label), acc, acc-all}
for _stepbin in range(step_div):
breakdown_steps[_stepbin] = {"count": 0, "dist": 0, "labels": Counter(), "lcorrs": [0]*nsys}
# -----
# collect
for one_sobj in all_sents:
cur_length = one_sobj.len
for one_tobj in one_sobj.rtoks: # all real toks
# -----
# get stat
gold_label = one_tobj.g.label
gold_numsib = _num_same_sibs(one_tobj.g)
gold_dist = abs(one_tobj.g.ddist)
# breakdown-label
breakdown_labels[gold_label]["gold"]["count"] += 1
breakdown_labels[gold_label]["gold"]["numsib"] += gold_numsib
breakdown_labels[gold_label]["gold"]["dist"] += gold_dist
for i, p in enumerate(one_tobj.ss):
pred_label = p.label
if pred_label in ["<z_non_z>", "<z_r_z>"]:
pred_label = "dep" # todo(note): fix padding prediction
pred_numsib = _num_same_sibs(p)
pred_dist = abs(p.ddist)
pred_lcorr = p.lcorr
pred_stepi = getattr(p, "efi", None)
if pred_stepi is None:
pred_stepi = getattr(p, "gmi", None)
assert pred_stepi is not None
pred_stepbin = int(pred_stepi*step_div/cur_length)
pred_stepp = pred_stepi / cur_length
# breakdown-label
breakdown_labels[pred_label]["preds"][i]["count"] += 1
breakdown_labels[pred_label]["preds"][i]["numsib"] += pred_numsib
breakdown_labels[pred_label]["preds"][i]["dist"] += pred_dist
breakdown_labels[pred_label]["preds"][i]["lcorr"] += pred_lcorr
breakdown_labels[pred_label]["preds"][i]["stepp"] += pred_stepp
# breakdown-steps
if i==0: # todo(note): only record the first one!!
breakdown_steps[pred_stepbin]["count"] += 1
breakdown_steps[pred_stepbin]["dist"] += pred_dist
breakdown_steps[pred_stepbin]["labels"][pred_label] += 1
for i2, p2 in enumerate(one_tobj.ss): # all nodes' correctness for this certain node!
breakdown_steps[pred_stepbin]["lcorrs"][i2] += p2.lcorr
# -----
# summary
data_labels = []
for k, dd in breakdown_labels.items():
gold_count = max(dd["gold"]["count"], 1e-5)
res = {"K": k, "gold_count": gold_count, "numsib": dd["gold"]["numsib"]/gold_count,
"dist": dd["gold"]["dist"]/gold_count}
for pidx, preds in enumerate(dd["preds"]):
pred_count = max(preds["count"], 1e-5)
res[f"pred{pidx}_count"] = pred_count
res[f"pred{pidx}_numsib"] = preds["numsib"]/pred_count
res[f"pred{pidx}_dist"] = preds["dist"]/pred_count
res[f"pred{pidx}_stepp"] = preds["stepp"]/pred_count
P, R = preds["lcorr"]/pred_count, preds["lcorr"]/gold_count
F = 2*P*R/(P+R) if (P+R)>0 else 0.
res.update({f"pred{pidx}_P": P, f"pred{pidx}_R": R, f"pred{pidx}_F": F})
data_labels.append(res)
data_steps = []
TOP_LABEL_K = 5
for k, dd in breakdown_steps.items():
dd_count = max(dd["count"], 1e-5)
res = {"K": k, "count": dd_count, "dist": dd["dist"]/dd_count}
for common_idx, common_p in enumerate(dd["labels"].most_common(TOP_LABEL_K)):
common_label, common_count = common_p
res[f"common{common_idx}"] = f"{common_label}({common_count/dd['count']:.3f})"
for pidx, pcorr in enumerate(dd["lcorrs"]):
res[f"pred{pidx}_acc"] = pcorr/dd_count
data_steps.append(res)
# =====
pd_labels = pd.DataFrame({k: [d[k] for d in data_labels] for k in data_labels[0].keys()})
pd_labels = pd_labels.sort_values(by="gold_count", ascending=False)
selections = ["K", "gold_count", "numsib", "dist", "pred0_numsib", "pred0_stepp", "pred0_F",
"pred1_numsib", "pred1_stepp", "pred1_F"]
pd_labels2 = pd_labels[selections]
pd_steps = pd.DataFrame({k: [d[k] for d in data_steps] for k in data_steps[0].keys()})
zlog(f"#-----\nLABELS: \n{pd_labels2.to_string()}\n\n")
zlog(f"#-----\nSTEPS: \n{pd_steps.to_string()}\n\n")
# specific table
TABLE_LABEL_K = 10
num_all_tokens = sum(z["gold_count"] for z in data_labels)
lines = []
for i in range(TABLE_LABEL_K):
ss = pd_labels.iloc[i]
fields = [ss["K"], f"{ss['gold_count']/num_all_tokens:.2f}", f"{ss['numsib']:.2f}", f"{ss['dist']:.2f}",
f"{ss['pred0_F']*100:.2f}", f"{ss['pred0_numsib']:.2f}",
f"{ss['pred1_F']*100:.2f}", f"{ss['pred1_numsib']:.2f}"]
lines.append(" & ".join(fields))
table_ss = "\\\\\n".join(lines)
zlog(f"#=====\n{table_ss}")
# -----
# import pdb
# pdb.set_trace()
return

if __name__ == '__main__':
main(sys.argv[1:])

# runnings
"""
PYTHONPATH=../../src/ python3 -m pdb run.py gold:en_dev.gold fs:en_dev.zef_ru.pred,en_dev.zg1_ru.pred
"""
23 changes: 23 additions & 0 deletions tasks/zdpar/main/build_vocab.py
@@ -0,0 +1,23 @@
#

# building vocabs
# the initial part of training

from ..common.confs import init_everything
from ..common.data import get_data_reader, get_multisoure_data_reader
from ..common.vocab import ParserVocabPackage

def main(args):
conf = init_everything(args+["partype:fp"])
dconf = conf.dconf
if dconf.multi_source:
_reader_getter = get_multisoure_data_reader
else:
_reader_getter = get_data_reader
train_streamer = _reader_getter(dconf.train, dconf.input_format, dconf.code_train, dconf.use_label0, cut=dconf.cut_train)
vpack = ParserVocabPackage.build_from_stream(dconf, train_streamer, []) # empty extra_stream
vpack.save(dconf.dict_dir)

# SRC_DIR="../src/"
# PYTHONPATH=${SRC_DIR}/ python3 ${SRC_DIR}/tasks/cmd.py zdpar.main.build_vocab train:[input] input_format:conllu dict_dir:[output_dir] init_from_pretrain:0 pretrain_file:?
# PYTHONPATH=${SRC_DIR}/ python3 ${SRC_DIR}/tasks/cmd.py zdpar.main.build_vocab train:../data/UD_RUN/ud24/en_all.conllu input_format:conllu dict_dir:./vocab/ init_from_pretrain:1 pretrain_file:../data/UD_RUN/ud24/wiki.multi.en.filtered.vec pretrain_scale:1 pretrain_init_nohit:1
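Besides the shell commands above, the same entry point can be called programmatically; a hypothetical invocation (option strings follow the key:value convention of the commands above, and the paths are placeholders):

from tasks.zdpar.main.build_vocab import main

main(["train:./data/en_train.conllu", "input_format:conllu", "dict_dir:./vocab/", "init_from_pretrain:0"])
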
14 changes: 9 additions & 5 deletions tasks/zdpar/stat2/__init__.py
@@ -1,11 +1,15 @@
#

# todo-list
# trying-list
# topical influence (use actual bert's predicted output to delete special semantics?) -> change words: first fix non-changed, then find repeated topic words, then change topic and hard words one per segment -> (191103: change too much may hurt)
# predict-leaf (use first half of order as leaf, modify the scores according to this first half?) -> (191104: still not good)
# other reduce criterion? -> (191105: not too much diff, do not get it too complex...)
# vocab based reduce? simply <= thresh? -> (191106: ok, help a little, but not our target)
# cky + decide direction later
# only change topical words?
# direct parse subword seq? (group and take max/avg-score)
# cluster (hierarchical bag of words, direction?, influence range, grouped influence)
# cky + decide direction later -> (191111: still not good, but first pdb-debug; 191112: still not good)
# cky + two-layer by splitting puncts? -> (191113: only slightly helpful)
# stop words (<100 in vocab) as lower ones? -> (191114: worse than pos-rule)
# check wsj and phrase result? -> (191114: f1 around 40)
# only change topical words? -> (191115: change 883/25148, no obvious diff)
# direct parse subword seq? (group and take max/avg-score) -> (191115: max-score seems to be slightly helpful +1)
# cluster (hierarchical bag of words, direction?, influence range, grouped influence) -> (191115: similar, but slightly worse than cky)
# -- ok, here is the end, goto next stage ...
