Commit

Updates for zfp.
zzsfornlp committed Jun 17, 2020
1 parent 565a6fb commit 842c8b7
Showing 24 changed files with 1,617 additions and 47 deletions.
5 changes: 4 additions & 1 deletion msp/__init__.py
@@ -23,14 +23,17 @@
# gru and cnn have problems?
# ----
# -- Next Version principles and goals:
# nlp data types
# nlp data types & unified data formats!!
# model and submodules and ... (composition or inheritance??) !!
# more flexible namings: model/module name, loss name, data-field names, ...
# use type hint
# checkings and reportings
# use eval for Conf
# io and serialization
# summarize more common patterns, including those in scripts
# everything has (more flexible) conf
# more flexible save/load for part of model; (better naming and support dynamic adding and deleting components!!)
# a small one: more flexible path finder, for example, multiple upper layers of ".."

def version():
return (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, VERSION_STATUS)
4 changes: 3 additions & 1 deletion msp/nn/backends/bktr.py
@@ -118,7 +118,9 @@ def update(self, overall_lrate, grad_factor):
for param_group in self.opt_.param_groups:
param_group['lr'] = cur_lrate
self.cached_lrate_ = cur_lrate
if cur_lrate<=0. and self.no_step_lrate0_:
# check if we need update
parameters = list(filter(lambda p: p.grad is not None, self.params_))
if (cur_lrate<=0. and self.no_step_lrate0_) or (len(parameters) == 0):
# no update
self.opt_.zero_grad()
else:
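
For reference, a minimal standalone sketch of the guard introduced here: skip the optimizer step when the learning rate is zero, or when no parameter actually received a gradient. The helper name and setup below are illustrative, not the repository's API:

import torch

def guarded_step(optimizer, params, cur_lrate, no_step_lrate0=True):
    # set the current learning rate on every param group
    for param_group in optimizer.param_groups:
        param_group['lr'] = cur_lrate
    # keep only parameters that actually received gradients in this backward pass
    params_with_grad = [p for p in params if p.grad is not None]
    if (cur_lrate <= 0. and no_step_lrate0) or len(params_with_grad) == 0:
        # no update: just clear any stale gradients
        optimizer.zero_grad()
    else:
        optimizer.step()
        optimizer.zero_grad()

# illustrative usage: a single linear layer with SGD
layer = torch.nn.Linear(4, 2)
opt = torch.optim.SGD(layer.parameters(), lr=0.1)
layer(torch.randn(3, 4)).sum().backward()
guarded_step(opt, list(layer.parameters()), cur_lrate=0.1)
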
2 changes: 1 addition & 1 deletion msp/utils/color.py
@@ -3,7 +3,7 @@
# colorful printing

try:
from colorama import colorama_init
from colorama import init as colorama_init
colorama_init()
from colorama import Fore, Back, Style
RESET_ALL = Style.RESET_ALL
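
The original import failed because colorama exposes its entry point as init, not colorama_init, so the surrounding try/except silently fell back to uncolored output; the fix aliases the real function on import. A minimal usage sketch (assumes colorama is installed):

from colorama import init as colorama_init, Fore, Style

colorama_init()  # enable ANSI color handling, including on Windows consoles
print(Fore.RED + "error: something went wrong" + Style.RESET_ALL)
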
7 changes: 6 additions & 1 deletion msp/zext/evaler.py
@@ -35,12 +35,13 @@ def __float__(self):
return self.f1

class LabelF1Evaler:
def __init__(self, name):
def __init__(self, name, ignore_none=False):
# key -> List[labels]
self.name = name
self.golds = {}
self.preds = {}
self.labels = set()
self.ignore_none = ignore_none

# =====
# adding ones
@@ -53,9 +54,13 @@ def _add_group(self, d: Dict, key, label):
self.labels.add(label)

def add_gold(self, key, label):
if key is None and self.ignore_none:
return
self._add_group(self.golds, key, label)

def add_pred(self, key, label):
if key is None and self.ignore_none:
return
self._add_group(self.preds, key, label)

# =====
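
A hypothetical usage sketch of the new ignore_none flag (LabelF1Evaler and its add_gold/add_pred methods are the ones shown above; the item lists are made up for illustration):

from msp.zext.evaler import LabelF1Evaler

evaler = LabelF1Evaler("labels", ignore_none=True)
gold_items = [("tok1", "nsubj"), (None, "obj")]   # (key, label) pairs; None marks an unanchored item
pred_items = [("tok1", "nsubj"), (None, "amod")]
for key, label in gold_items:
    evaler.add_gold(key, label)   # entries with key=None are now silently skipped
for key, label in pred_items:
    evaler.add_pred(key, label)
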
11 changes: 10 additions & 1 deletion tasks/zdpar/common/confs.py
@@ -47,7 +47,9 @@ def __init__(self):
# special processing
self.lower_case = False
self.norm_digit = False # norm digits to 0
self.use_label0 = False # using only first-level label
self.use_label0 = True # using only first-level label # todo(note): change the default behaviour
zwarn("Note: currently we change default value of 'use_label0' to True!")
self.vocab_add_prevalues = True # add pre-defined UDv2 values when building dicts
# =====
# for multi-lingual processing (another option is to pre-processing suitable data)
# language code (empty str for no effects)
@@ -89,6 +91,9 @@ def __init__(self, partype, args):
elif partype == "s2":
from ..ef.parser import S2ParserConf
self.pconf = S2ParserConf()
elif partype == "fp":
from ..zfp.fp import FpParserConf
self.pconf = FpParserConf()
else:
zfatal(f"Unknown parser type: {partype}, please provide correct type with the option.")
# =====
@@ -125,6 +130,10 @@ def build_model(partype, conf, vpack):
# two-stage parser
from ..ef.parser import S2Parser
parser = S2Parser(pconf, vpack)
elif partype == "fp":
# the finale parser
from ..zfp.fp import FpParser
parser = FpParser(pconf, vpack)
else:
zfatal("Unknown parser type: %s")
return parser
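
The new "fp" branch follows the same lazy-import dispatch as the existing parser types; reduced to a sketch (class and module names as in the diff, the function itself is illustrative):

def build_fp_parser(partype, pconf, vpack):
    if partype == "fp":
        # new parser type registered in this commit
        from tasks.zdpar.zfp.fp import FpParser
        return FpParser(pconf, vpack)
    raise ValueError(f"Unknown parser type: {partype}")
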
1 change: 1 addition & 0 deletions tasks/zdpar/common/model.py
@@ -283,6 +283,7 @@ def __init__(self):
self.load_process = False
# batch arranger
self.batch_size = 32
self.train_min_length = 0
self.train_skip_length = 120
self.shuffle_train = True
# optimizer and lrate factor for enc&dec&sl(mid)
4 changes: 2 additions & 2 deletions tasks/zdpar/common/run.py
@@ -54,8 +54,8 @@ def index_stream(in_stream, vpack, cached, cache_shuffle, inst_preparer):
def batch_stream(in_stream, ticonf, training):
if training:
b_stream = BatchArranger(in_stream, batch_size=ticonf.batch_size, maxibatch_size=20, batch_size_f=None,
dump_detectors=lambda one: len(one)>=ticonf.train_skip_length, single_detectors=None,
sorting_keyer=len, shuffling=ticonf.shuffle_train)
dump_detectors=lambda one: len(one)>=ticonf.train_skip_length or len(one)<ticonf.train_min_length,
single_detectors=None, sorting_keyer=len, shuffling=ticonf.shuffle_train)
else:
b_stream = BatchArranger(in_stream, batch_size=ticonf.batch_size, maxibatch_size=-1, batch_size_f=None,
dump_detectors=None, single_detectors=lambda one: len(one)>=ticonf.infer_single_length,
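
A sketch of the length filter the new option enables: training instances are dropped when they are too long (train_skip_length, as before) or, new in this commit, shorter than train_min_length. Everything below other than those two option names is illustrative:

from types import SimpleNamespace

def make_dump_detector(ticonf):
    def _dump(inst):
        n = len(inst)
        # drop instances that are too long OR too short for training
        return n >= ticonf.train_skip_length or n < ticonf.train_min_length
    return _dump

ticonf = SimpleNamespace(train_skip_length=120, train_min_length=5)  # min length 5 is just an example; the default is 0
dump = make_dump_detector(ticonf)
assert dump(list(range(130))) and dump([1, 2]) and not dump(list(range(30)))
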
19 changes: 17 additions & 2 deletions tasks/zdpar/common/vocab.py
@@ -22,6 +22,12 @@ def build_by_reading(dconf):
one.load(dconf.dict_dir)
return one

# =====
# pre-values for UDv2
PRE_VALUES_ULAB = ["punct", "case", "nsubj", "det", "root", "<z_r_z>", "nmod", "advmod", "obj", "obl", "amod", "compound", "aux", "conj", "mark", "cc", "cop", "advcl", "acl", "xcomp", "nummod", "ccomp", "appos", "flat", "parataxis", "discourse", "expl", "fixed", "list", "iobj", "csubj", "goeswith", "vocative", "reparandum", "orphan", "dep", "dislocated", "clf"]
PRE_VALUES_UPOS = ["NOUN", "PUNCT", "VERB", "PRON", "ADP", "DET", "PROPN", "<z_r_z>", "ADJ", "AUX", "ADV", "CCONJ", "PART", "NUM", "SCONJ", "X", "INTJ", "SYM"]
# =====

@staticmethod
def build_from_stream(dconf: DConf, stream, extra_stream):
zlog("Build vocabs from streams.")
@@ -32,13 +38,22 @@ def build_from_stream(dconf: DConf, stream, extra_stream):
pos_builder = VocabBuilder("pos")
label_builder = VocabBuilder("label")
word_normer = ret.word_normer
if dconf.vocab_add_prevalues:
zlog(f"Add pre-defined values for upos({len(ParserVocabPackage.PRE_VALUES_UPOS)}) and "
f"ulabel({len(ParserVocabPackage.PRE_VALUES_ULAB)}).")
pos_builder.feed_stream(ParserVocabPackage.PRE_VALUES_UPOS)
label_builder.feed_stream(ParserVocabPackage.PRE_VALUES_ULAB)
for inst in stream:
# todo(warn): only do special handling for words
# there must be words
word_builder.feed_stream(word_normer.norm_stream(inst.words.vals))
for w in inst.words.vals:
char_builder.feed_stream(w)
pos_builder.feed_stream(inst.poses.vals)
label_builder.feed_stream(inst.labels.vals)
# pos and label can be optional
if inst.poses.has_vals():
pos_builder.feed_stream(inst.poses.vals)
if inst.labels.has_vals():
label_builder.feed_stream(inst.labels.vals)
#
w2vec = None
if dconf.init_from_pretrain:
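
The intent of vocab_add_prevalues, in sketch form: make sure every UDv2 tag and dependency label ends up in the built vocabularies even if the training treebank never uses it, so decoding on other treebanks does not hit unknown labels. A toy stand-in for the builder (the real interface is the VocabBuilder/feed_stream one shown in the diff):

from collections import Counter

def build_toy_vocab(pre_values, corpus_values):
    counts = Counter()
    counts.update(pre_values)      # pre-defined UDv2 inventory, fed first
    counts.update(corpus_values)   # values actually observed in training data
    return set(counts)

vocab = build_toy_vocab(["nsubj", "obj", "expl"], ["nsubj", "obj", "obj"])
assert "expl" in vocab             # present even though the corpus never contained it
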
2 changes: 1 addition & 1 deletion tasks/zdpar/ef/analysis/ann.py
@@ -97,7 +97,7 @@ def do_print(self, ocode):
# -----
# looking/annotating at specific instances. protocol: target

# start new annotation task or TODO(!) recover the previous one
# start new annotation task or TODO(+N) recover the previous one
def do_ann_start(self, insts_target: str) -> AnnotationTask:
assert self.cur_cmd_target is not None, "Should assign this to a var to avoid accidental loss!"
vs = self.vars
177 changes: 177 additions & 0 deletions tasks/zdpar/ef/analysis/run1217.py
@@ -0,0 +1,177 @@
#

# case study for the ef one
# error breakdown on labels,steps; which ones are "easy"(first-decoded) ones

#

import sys
from collections import Counter
from msp.zext.ana import AnalyzerConf, Analyzer, ZRecNode, AnnotationTask

try:
from .ann import *
except:
from ann import *

#
class NewAnalysisConf(AnalysisConf):
def __init__(self, args):
super().__init__(args)
#
self.step_div = 5
self.use_label0 = True

def main(args):
conf = NewAnalysisConf(args)
# =====
if conf.load_name == "":
# recalculate them
# read them
zlog("Read them all ...")
gold_parses = list(yield_ones(conf.gold))
sys_parses = [list(yield_ones(z)) for z in conf.fs]
if conf.use_label0:
# todo(note): force using label0 (language-independent)
zlog("Force label0 ...")
for one_parses in sys_parses + [gold_parses]:
for one_parse in one_parses:
for one_token in one_parse.get_tokens():
one_token.label = one_token.label0
# use vocab?
voc = Vocab.read(conf.vocab) if len(conf.vocab)>0 else None
# =====
# stat them
zlog("Stat them all ...")
all_sents, all_tokens = get_objs(gold_parses, sys_parses, conf.getter)
analyzer = ParsingAnalyzer(conf.ana, all_sents, all_tokens, conf.labeled, vocab=voc)
analyzer.set_var("nsys", len(conf.fs), explanation="init", history_idx=-1)
if conf.save_name != "":
analyzer.do_save(conf.save_name)
else:
analyzer = ParsingAnalyzer(conf.ana, None, None, conf.labeled)
analyzer.do_load(conf.load_name)
# =====
# special analysis
# ----
def _num_same_sibs(_node):
# todo(note): here split label again
_lab = _node.label
if conf.use_label0:
_count = sum(z.split(":")[0]==_lab for z in _node.get_head().childs_labels)
else:
_count = sum(z==_lab for z in _node.get_head().childs_labels)
assert _count>=1
return _count-1
# ----
all_sents = analyzer.get_var("sents")
nsys = analyzer.get_var("nsys")
step_div = conf.step_div # how many bins for ef-steps?
breakdown_labels = {} # label -> {gold: {count, numsib, dist}, preds: [{count, numsib, dist, lcorr, stepp}]}
for _lab in ulabel2type["Nivre17"].keys():
breakdown_labels[_lab] = {"gold": {"count": 0, "numsib": 0, "dist": 0},
"preds": [{"count": 0, "numsib": 0, "dist": 0, "lcorr": 0, "stepp": 0} for _ in range(nsys)]}
breakdown_steps = {} # stepbin -> {count, dist, counter(label), acc, acc-all}
for _stepbin in range(step_div):
breakdown_steps[_stepbin] = {"count": 0, "dist": 0, "labels": Counter(), "lcorrs": [0]*nsys}
# -----
# collect
for one_sobj in all_sents:
cur_length = one_sobj.len
for one_tobj in one_sobj.rtoks: # all real toks
# -----
# get stat
gold_label = one_tobj.g.label
gold_numsib = _num_same_sibs(one_tobj.g)
gold_dist = abs(one_tobj.g.ddist)
# breakdown-label
breakdown_labels[gold_label]["gold"]["count"] += 1
breakdown_labels[gold_label]["gold"]["numsib"] += gold_numsib
breakdown_labels[gold_label]["gold"]["dist"] += gold_dist
for i, p in enumerate(one_tobj.ss):
pred_label = p.label
if pred_label in ["<z_non_z>", "<z_r_z>"]:
pred_label = "dep" # todo(note): fix padding prediction
pred_numsib = _num_same_sibs(p)
pred_dist = abs(p.ddist)
pred_lcorr = p.lcorr
pred_stepi = getattr(p, "efi", None)
if pred_stepi is None:
pred_stepi = getattr(p, "gmi", None)
assert pred_stepi is not None
pred_stepbin = int(pred_stepi*step_div/cur_length)
pred_stepp = pred_stepi / cur_length
# breakdown-label
breakdown_labels[pred_label]["preds"][i]["count"] += 1
breakdown_labels[pred_label]["preds"][i]["numsib"] += pred_numsib
breakdown_labels[pred_label]["preds"][i]["dist"] += pred_dist
breakdown_labels[pred_label]["preds"][i]["lcorr"] += pred_lcorr
breakdown_labels[pred_label]["preds"][i]["stepp"] += pred_stepp
# breakdown-steps
if i==0: # todo(note): only record the first one!!
breakdown_steps[pred_stepbin]["count"] += 1
breakdown_steps[pred_stepbin]["dist"] += pred_dist
breakdown_steps[pred_stepbin]["labels"][pred_label] += 1
for i2, p2 in enumerate(one_tobj.ss): # all nodes' correctness for this certain node!
breakdown_steps[pred_stepbin]["lcorrs"][i2] += p2.lcorr
# -----
# summary
data_labels = []
for k, dd in breakdown_labels.items():
gold_count = max(dd["gold"]["count"], 1e-5)
res = {"K": k, "gold_count": gold_count, "numsib": dd["gold"]["numsib"]/gold_count,
"dist": dd["gold"]["dist"]/gold_count}
for pidx, preds in enumerate(dd["preds"]):
pred_count = max(preds["count"], 1e-5)
res[f"pred{pidx}_count"] = pred_count
res[f"pred{pidx}_numsib"] = preds["numsib"]/pred_count
res[f"pred{pidx}_dist"] = preds["dist"]/pred_count
res[f"pred{pidx}_stepp"] = preds["stepp"]/pred_count
P, R = preds["lcorr"]/pred_count, preds["lcorr"]/gold_count
F = 2*P*R/(P+R) if (P+R)>0 else 0.
res.update({f"pred{pidx}_P": P, f"pred{pidx}_R": R, f"pred{pidx}_F": F})
data_labels.append(res)
data_steps = []
TOP_LABEL_K = 5
for k, dd in breakdown_steps.items():
dd_count = max(dd["count"], 1e-5)
res = {"K": k, "count": dd_count, "dist": dd["dist"]/dd_count}
for common_idx, common_p in enumerate(dd["labels"].most_common(TOP_LABEL_K)):
common_label, common_count = common_p
res[f"common{common_idx}"] = f"{common_label}({common_count/dd['count']:.3f})"
for pidx, pcorr in enumerate(dd["lcorrs"]):
res[f"pred{pidx}_acc"] = pcorr/dd_count
data_steps.append(res)
# =====
pd_labels = pd.DataFrame({k: [d[k] for d in data_labels] for k in data_labels[0].keys()})
pd_labels = pd_labels.sort_values(by="gold_count", ascending=False)
selections = ["K", "gold_count", "numsib", "dist", "pred0_numsib", "pred0_stepp", "pred0_F",
"pred1_numsib", "pred1_stepp", "pred1_F"]
pd_labels2 = pd_labels[selections]
pd_steps = pd.DataFrame({k: [d[k] for d in data_steps] for k in data_steps[0].keys()})
zlog(f"#-----\nLABELS: \n{pd_labels2.to_string()}\n\n")
zlog(f"#-----\nSTEPS: \n{pd_steps.to_string()}\n\n")
# specific table
TABLE_LABEL_K = 10
num_all_tokens = sum(z["gold_count"] for z in data_labels)
lines = []
for i in range(TABLE_LABEL_K):
ss = pd_labels.iloc[i]
fields = [ss["K"], f"{ss['gold_count']/num_all_tokens:.2f}", f"{ss['numsib']:.2f}", f"{ss['dist']:.2f}",
f"{ss['pred0_F']*100:.2f}", f"{ss['pred0_numsib']:.2f}",
f"{ss['pred1_F']*100:.2f}", f"{ss['pred1_numsib']:.2f}"]
lines.append(" & ".join(fields))
table_ss = "\\\\\n".join(lines)
zlog(f"#=====\n{table_ss}")
# -----
# import pdb
# pdb.set_trace()
return

if __name__ == '__main__':
main(sys.argv[1:])

# runnings
"""
PYTHONPATH=../../src/ python3 -m pdb run.py gold:en_dev.gold fs:en_dev.zef_ru.pred,en_dev.zg1_ru.pred
"""
23 changes: 23 additions & 0 deletions tasks/zdpar/main/build_vocab.py
@@ -0,0 +1,23 @@
#

# building vocabs
# the initial part of training

from ..common.confs import init_everything
from ..common.data import get_data_reader, get_multisoure_data_reader
from ..common.vocab import ParserVocabPackage

def main(args):
conf = init_everything(args+["partype:fp"])
dconf = conf.dconf
if dconf.multi_source:
_reader_getter = get_multisoure_data_reader
else:
_reader_getter = get_data_reader
train_streamer = _reader_getter(dconf.train, dconf.input_format, dconf.code_train, dconf.use_label0, cut=dconf.cut_train)
vpack = ParserVocabPackage.build_from_stream(dconf, train_streamer, []) # empty extra_stream
vpack.save(dconf.dict_dir)

# SRC_DIR="../src/"
# PYTHONPATH=${SRC_DIR}/ python3 ${SRC_DIR}/tasks/cmd.py zdpar.main.build_vocab train:[input] input_format:conllu dict_dir:[output_dir] init_from_pretrain:0 pretrain_file:?
# PYTHONPATH=${SRC_DIR}/ python3 ${SRC_DIR}/tasks/cmd.py zdpar.main.build_vocab train:../data/UD_RUN/ud24/en_all.conllu input_format:conllu dict_dir:./vocab/ init_from_pretrain:1 pretrain_file:../data/UD_RUN/ud24/wiki.multi.en.filtered.vec pretrain_scale:1 pretrain_init_nohit:1
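Besides the shell commands above, the same entry point can be called programmatically; a hypothetical invocation (option strings follow the key:value convention of the commands above, and the paths are placeholders):

from tasks.zdpar.main.build_vocab import main

main(["train:./data/en_train.conllu", "input_format:conllu", "dict_dir:./vocab/", "init_from_pretrain:0"])
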
14 changes: 9 additions & 5 deletions tasks/zdpar/stat2/__init__.py
@@ -1,11 +1,15 @@
#

# todo-list
# trying-list
# topical influence (use actual bert's predicted output to delete special semantics?) -> change words: first fix non-changed, then find repeated topic words, then change topic and hard words one per segment -> (191103: change too much may hurt)
# predict-leaf (use first half of order as leaf, modify the scores according to this first half?) -> (191104: still not good)
# other reduce criterion? -> (191105: not too much diff, do not get it too complex...)
# vocab based reduce? simply <= thresh? -> (191106: ok, help a little, but not our target)
# cky + decide direction later
# only change topical words?
# direct parse subword seq? (group and take max/avg-score)
# cluster (hierarchical bag of words, direction?, influence range, grouped influence)
# cky + decide direction later -> (191111: still not good, but first pdb-debug; 191112: still not good)
# cky + two-layer by splitting puncts? -> (191113: only slightly helpful)
# stop words (<100 in vocab) as lower ones? -> (191114: worse than pos-rule)
# check wsj and phrase result? -> (191114: f1 around 40)
# only change topical words? -> (191115: change 883/25148, no obvious diff)
# direct parse subword seq? (group and take max/avg-score) -> (191115: max-score seems to be slightly helpful +1)
# cluster (hierarchical bag of words, direction?, influence range, grouped influence) -> (191115: similar, but slightly worse than cky)
# -- ok, here is the end, goto next stage ...
