Skip to content

Commit

Permalink
Add some other parsers.
Browse files Browse the repository at this point in the history
  • Loading branch information
zzsfornlp committed Jan 19, 2020
1 parent ed43cc5 commit 5f7d454
Show file tree
Hide file tree
Showing 123 changed files with 16,726 additions and 915 deletions.
28 changes: 19 additions & 9 deletions msp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
#

# The Mingle Structured Prediction (v0plus) package
# The Mingled Structured Prediction (v0plus) package
# by zzs (from 2018.02 - now)

# dependencies: pytorch, numpy, scipy, gensim, cython
# dependencies: pytorch, numpy, scipy, gensim, cython, pybind11
# conda install pytorch numpy scipy gensim cython pybind11

VERSION_MAJOR = 0
VERSION_MINOR = 0
VERSION_MINOR = 1
VERSION_PATCH = 1

VERSION_STATUS = "dev"

# TODO(!)
# nn optimizer / param groups?
# check nn module (for simplification?)
# new model/training/testing scheme -> make it more modularized
# Conf's init: what types
# dropout setting: use training/testing(with/wo)-mode
# easy-to-use calculations result-representing tools for analysis
# various tools for python as the replacement of direct bash shell
# gru and cnn have problems?

def version():
    """Return the package version as a (major, minor, patch, status) tuple.

    The status component (e.g. "dev") is included so that the derived
    ``__version__`` string reads like ``"0.1.1.dev"``.
    """
    # fix: a stale duplicate `return` of the old 3-tuple made the 4-tuple
    # (with VERSION_STATUS) unreachable; keep only the intended one.
    return (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH, VERSION_STATUS)

__version__ = ".".join(str(z) for z in version())

Expand All @@ -26,9 +36,9 @@ def version():
# !!: (again-corrected) The goal is not to build a whole framework, but several useful pieces.

# conventions
# todo(0)/todo(warn): simply warning
# todo(+N): need what level of efforts
# TODO(!): unfinished, real todo
# todo(0)/todo(warn)/todo(note): simply warning or noting
# todo(+N): need what level of efforts, +N means lots of efforts
# TODO: unfinished, real todo

# hierarchically: msp -> scripts / tasks, no reverse ref allowed!

Expand Down
2 changes: 2 additions & 0 deletions msp/cmp/zarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import numpy as np

# todo(warn): should use pandas!

class ZArray:
def __init__(self, arr: np.ndarray, names=None):
self.arr = arr
Expand Down
34 changes: 34 additions & 0 deletions msp/cmp/ztab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#

#
#
class ZTab:
    """Simple separated-value table (de)serializer.

    Layout convention: the first line holds the column names, the first field
    of every following line holds the row name, and the top-left corner cell
    is a blank placeholder.
    """

    @staticmethod
    def read(ss, fsep='\t', lsep='\n'):
        """Parse table text into ``(row_names, col_names, {row: {col: value}})``.

        ss: the full table text; fsep/lsep: field and line separators.
        Raises AssertionError on duplicated row names or ragged rows.
        """
        pieces = [z.split(fsep) for z in ss.split(lsep)]
        # first header cell is the blank corner, hence the -1
        num_col = len(pieces[0]) - 1
        row_names = [z[0] for z in pieces[1:]]
        col_names = pieces[0][1:]
        ret = {}
        # add entries
        for one_row in pieces[1:]:
            k = one_row[0]
            assert k not in ret, f"Duplicated row name: {k}"
            cur_m = {}
            assert len(one_row) - 1 == num_col, f"Ragged row (wrong field count): {k}"
            for n, v in zip(col_names, one_row[1:]):
                cur_m[n] = v
            ret[k] = cur_m
        return row_names, col_names, ret

    @staticmethod
    def write(row_names, col_names, value_f, fsep='\t', lsep='\n'):
        """Render a table string; ``value_f(row_name, col_name)`` supplies each cell."""
        pieces = []
        # add head line: blank corner + column names
        pieces.append([""] + col_names)
        # add one line per row
        for r in row_names:
            pieces.append([r] + [value_f(r, c) for c in col_names])
        # finally concat
        ss = lsep.join([fsep.join(z) for z in pieces])
        return ss
2 changes: 1 addition & 1 deletion msp/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from .data import Instance, TextReader, FdReader, WordNormer
from .vocab import Vocab, VocabHelper, VocabBuilder, WordVectors, VocabPackage, MultiHelper
from .streamer import Streamer, AdapterStreamer, FAdapterStreamer, FileOrFdStreamer, BatchArranger, InstCacher, \
MultiCatStreamer, IterStreamer, MultiZipStreamer
MultiCatStreamer, IterStreamer, MultiZipStreamer, FListAdapterStream, MultiJoinStreamer, ShuffleStreamer
94 changes: 83 additions & 11 deletions msp/data/streamer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#

from msp.utils import zopen, zcheck, zfatal, ZException
from msp.utils import zopen, zcheck, zfatal
from msp.utils import Constants, Random
from typing import Iterable, Sequence

Expand Down Expand Up @@ -147,10 +147,25 @@ def _next(self):
else:
return z

# f: Inst -> List[Inst] (augment or filter)
class FListAdapterStream(AdapterStreamer):
    """Adapter that maps every upstream instance through `f`, which may expand
    it into several instances (augmentation) or into none (filtering)."""

    def __init__(self, base_streamer, f):
        super().__init__(base_streamer)
        self.f = f
        self.cache = []       # pending outputs of f for the current upstream instance
        self.cache_idx = 0    # next position in self.cache to yield

    def _next(self):
        # refill until the cache has an unread item; f may return [] (filtered out),
        # in which case we keep pulling from upstream
        while not (self.cache_idx < len(self.cache)):
            one = self.base_streamer_.next()
            if self.base_streamer_.is_eos(one):
                return None  # upstream exhausted -> signal EOS
            self.cache.clear()
            self.cache.extend(self.f(one))
            self.cache_idx = 0
        idx = self.cache_idx
        self.cache_idx = idx + 1
        return self.cache[idx]

# =====
# Random Streams
Expand Down Expand Up @@ -202,13 +217,6 @@ def _next(self):
if self.rs_.next():
return one

# TODO(+2)
class ShuffleStreamer(AdapterStreamer):
# -1 means read all and shuffle
def __init__(self, base_streamer, batch_size=-1):
super().__init__(base_streamer)
raise NotImplementedError()

# =====
# Multi Streamers

Expand Down Expand Up @@ -247,6 +255,45 @@ def _restart(self):
one.restart()
self._mc_set_cur(0)

#
# like MultiCat, but join the streams (horizontally) rather than concat (vertically)
# todo(note): currently mixing them 1 by 1; maybe not efficient enough if there are too many streamers
class MultiJoinStreamer(MultiStreamer):
    """Round-robin interleaving of several base streamers.

    Yields one instance from each still-live streamer in turn; a streamer that
    reaches EOS is flagged in ``self.ended`` and skipped afterwards.  The joined
    stream signals EOS (returns None) only once every base streamer has ended.
    """

    def __init__(self, base_streamers: Sequence):
        super().__init__(base_streamers)
        # index of the streamer to try first on the next _next() call
        self.cur_pointer = 0
        # per-streamer EOS flags, parallel to self.base_streamers_
        self.ended = [False] * len(self.base_streamers_)

    def _next(self):
        # find the next active streamer
        starting_point = self.cur_pointer
        num_streamers = self.num_streamers_
        cur_point = starting_point
        flag_success = False
        one = None
        for i in range(num_streamers):  # at most travel one round
            if not self.ended[cur_point]:
                new_streamer = self.base_streamers_[cur_point]
                one = new_streamer.next()
                if new_streamer.is_eos(one):
                    # mark this streamer exhausted and keep scanning the rest
                    self.ended[cur_point] = True
                else:
                    flag_success = True
            # advance BEFORE possibly breaking, so the next call starts at the
            # streamer after the one just consumed (true round-robin)
            cur_point = (cur_point + 1) % num_streamers
            if flag_success:
                break
        self.cur_pointer = cur_point
        if flag_success:
            return one
        else:
            # a full round found nothing alive -> overall EOS
            return None

    def _restart(self):
        # restart every base streamer and reset round-robin state
        for one in self.base_streamers_:
            one.restart()
        self.cur_pointer = 0
        self.ended = [False] * len(self.base_streamers_)

#
# zip-like multi-streamer, with various modes, return a list of instances
# -- can be useful to combine with LoopStreamer for wrapping-around
Expand Down Expand Up @@ -301,6 +348,9 @@ def get(self):
def reset(self):
raise NotImplementedError()

    def clear(self):
        # drop all cached items and reset read state (abstract; implemented by subclasses)
        raise NotImplementedError()

def __len__(self):
raise NotImplementedError()

Expand All @@ -325,6 +375,10 @@ def get(self):
self.ptr += 1
return self.c[self.ptr-1]

    def clear(self):
        # forget all stored items and rewind the read pointer
        self.c.clear()
        self.ptr = 0

def reset(self):
self.ptr = 0
if self.shuffle:
Expand All @@ -347,6 +401,24 @@ def _restart(self):
self.cache.put(one)
self.cache.reset()

# todo(+1): partial shuffle to avoid read all?
class ShuffleStreamer(AdapterStreamer):
    """Reads the entire source stream into a shuffling cache on each (re)start
    and then serves the instances back in shuffled order."""

    # -1 means read all and shuffle
    def __init__(self, src_stream, cache_builder=InplacedCache):
        super().__init__(src_stream)
        self.src = src_stream
        # the cache both stores the instances and shuffles them on reset()
        self.cache = cache_builder(shuffle=True)

    def _next(self):
        # all the real work happens in _restart; here we only drain the cache
        return self.cache.get()

    def _restart(self):
        # todo(note): rebuild cache each time; iterating self.src triggers the
        # source's own restart, so there is no need to call super() here
        self.cache.clear()
        for inst in self.src:
            self.cache.put(inst)
        self.cache.reset()

# =====
# BatchHelper

Expand Down
Loading

0 comments on commit 5f7d454

Please sign in to comment.