
Commit

Add files via upload
dingle0422 authored Jan 21, 2022
1 parent b8c3ba9 commit f6dcf28
Showing 12 changed files with 788 additions and 111 deletions.
173 changes: 130 additions & 43 deletions HMM_Model.py
@@ -1,30 +1,62 @@
import jieba.posseg as pseg
import pickle
import configparser
from corpus_prep import add_old_terms
from collections import defaultdict
import numpy as np
import re
import logging
import os
cwd = os.path.split(os.path.realpath(__file__))[0]
import sys
import codecs
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())

logger = logging.getLogger(__name__)
logger.setLevel(level = logging.INFO)
handler = logging.FileHandler(os.path.join(cwd, r"logs/log.log"),encoding="utf-8")
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


# Whether to use stopwords
cwd = os.path.split(os.path.realpath(__file__))[0]
conf = configparser.ConfigParser()
conf.read(os.path.join(cwd , r"config.txt"))
keep_stopwords = conf.getboolean("strategy","keep_stopwords")
# Add domain terms
datapath = os.path.join(cwd,"data")
if conf.get("data_name","old"):
add_old_terms(filepath= os.path.join(datapath,conf.get("data_name","old"))) # tax_term


# Load the trained matrices
e_mat = pickle.load(open(r"./data/e_mat.pkl", 'rb'))
t_mat = pickle.load(open(r"./data/t_mat.pkl", 'rb'))
e_mat = pickle.load(open(os.path.join(cwd, r"data/e_mat.pkl"), 'rb'))
t_mat = pickle.load(open(os.path.join(cwd, r"data/t_mat.pkl"), 'rb'))


def tokenizer(string: str) -> list:
def tokenizer(string: str, keep_stopwords = keep_stopwords) -> list:
'''First tokenize the original string with jieba
'''
with open(r"./data/stopwords_chi.txt", 'r', encoding= 'utf8') as sw:
with open(os.path.join(cwd,r"data/stopwords_chi.txt"), 'r', encoding= 'utf8') as sw:
stopwords = [i.strip() for i in sw.readlines()]

string = ",".join([i for i in re.findall(r'[\u4e00-\u9fa5]+', string)])

if keep_stopwords:
# print([i for i in pseg.lcut(string) if i.word not in stopwords])
return [i for i in pseg.lcut(string) if i.word not in stopwords]
return [i for i in pseg.lcut(string)]
else:
return [i for i in pseg.lcut(string) if i.word not in stopwords]
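# A minimal usage sketch (the input sentence is just an example): tokenizer("如何开具增值税发票")
# returns a list of jieba pseg pair objects, each exposing .word and .flag; stopwords are
# filtered out when keep_stopwords is False and kept otherwise.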


def viterbi(string) -> list:
def viterbi(string, one_sentence = True) -> list:
'''Use the Viterbi algorithm to find the best POS-tagging path for a string
string: input string
e_mat: emission probability matrix (observed layer)
t_mat: transition probability matrix (hidden layer)
one_sentence: whether the call is for a single sentence or for a txt file
'''
global e_mat, t_mat
pairs = tokenizer(string)
@@ -38,39 +70,55 @@ def viterbi(string) -> list:


state_num = {"s":0 ,"sn":1, "b":2, "e":3, "m":4, 'c':5}

# T1 stores the maximum probability of each hidden state at each step; T2 stores which hidden state at t-1 the state at t came from
T1,T2 = np.zeros((len(state_num), len(word))), np.zeros((len(state_num), len(word))) # axis 0: states, axis 1: observations
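# A sketch of the recursion the loops below implement, in log space, where A is t_mat
# (transition log-probs) and B is x_y_dict (per-position emission log-probs):
#   T1[s, t] = max_{s'} ( T1[s', t-1] + A[s'][s] + B[t][s] )
#   T2[s, t] = argmax_{s'} ( T1[s', t-1] + A[s'][s] + B[t][s] )
# The best tag sequence is then recovered by backtracking through T2 from the last column.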
for pos in range(len(word)-1):
t_0_launch = x_y_dict[pos]
t_1_launch = x_y_dict[pos + 1]
# Since we want the best path into each following step, we iterate starting from the hidden states of the next step
for k1,v1 in t_1_launch.items():
for k1,v1 in t_1_launch.items(): # states : log-prob
max_p = -np.inf # compare against this to update the maximum probability
prob_l = [] # store probabilities of the different paths leading to the same state
mark_l = [] # store the state marks corresponding to those probabilities
for k0,v0 in t_0_launch.items():
if pos == 0:
if max_p < v0 + t_mat[k0][k1] + v1:
prob_l.append(v0 + t_mat[k0][k1] + v1) # first-step emission prob + transition prob to the next state + next-step emission prob
if max_p < v0 + t_mat[k0].get(k1, -np.inf) + v1:
prob_l.append(v0 + t_mat[k0].get(k1, -np.inf) + v1) # first-step emission prob + transition prob to the next state + next-step emission prob
mark_l.append(k0)

else:
if max_p < T1[state_num[k0], pos] +t_mat[k0][k1] + v1:
prob_l.append(T1[state_num[k0], pos] + t_mat[k0][k1] + v1)
if max_p < T1[state_num[k0], pos] +t_mat[k0].get(k1, -np.inf) + v1:
prob_l.append(T1[state_num[k0],pos] + t_mat[k0].get(k1, -np.inf) + v1) # best path prob so far + transition prob to the next state + next-step emission prob
mark_l.append(k0)



# Pick the best path into this step and store its probability and backpointer
if not prob_l:
T1[state_num[k1],pos + 1] = -np.inf
T2[state_num[k1],pos + 1] = None
else:
T1[state_num[k1],pos + 1] = max(prob_l)
T2[state_num[k1],pos + 1] = state_num[mark_l[prob_l.index(max(prob_l))]]




lastcol = list(T1[:,-1])
# print(T1,T2)
# print(lastcol)
assert max(lastcol) != -np.inf, "No sequent path can be found in string: {}".format(string)

if max(lastcol) == -np.inf:
if one_sentence:
logger.info("No sequent path can be found in string: {} \n The words tokenized by jieba will be return".format(string))
print( "No sequent path can be found in string: {}".format(string))
return pairs
else:
assert max(lastcol) != -np.inf, "No sequent path can be found in string: {}".format(string)

lastmark = lastcol.index(max(lastcol))

# print(T2)

path = []
def find_path(lastmark, pos):
if pos == -len(word)-1:
@@ -87,53 +135,92 @@ def find_path(lastmark, pos):



def get_terms_vtb(string, single = False):
def get_terms_vtb(string, one_sentence, single = False):
'''Find the combined domain new terms from the tagged string
string: input string
single: whether to also add uncombined single nouns to the new-term set
single: whether to also add uncombined single nouns (sn) to the new-term set
one_sentence: whether the call is for a single sentence or for a txt file
'''
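# Illustrative sketch (hypothetical tags, not real model output): given
#   pairs = [['增值', 'b'], ['税', 'm'], ['发票', 'e'], ['的', 's'], ['税率', 'sn']]
# the b-m-e run is joined into the new term '增值税发票', and with single=True the
# lone 'sn' noun '税率' is also appended to new_words.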
r = viterbi(string)

r = viterbi(string, one_sentence)

if isinstance(r, list):
return r

pairs = r[1]
marks = r[0]
if not marks:
print("This sentence can not be decomposed: \n",string)
return

# print(pairs)
pairs= [[pairs[i].word, marks[i]] for i in range(len(pairs))]
b = False
m = False
new_words = []
word_parts = []
# print(pairs)
# b = False
# m = False
new_words = []
for i in range(len(pairs)):
if pairs[i][1] == "b" and not b: # b
b = True

if pairs[i][1] == "b":# and pairs[i-1][1] not in ["b","m"]: # b
word_parts = []
word_parts.append(pairs[i][0])
for ii in range(i+1, len(pairs)):

if pairs[ii][1] == 'm':
word_parts.append(pairs[ii][0])

elif pairs[ii][1] == 'e':
word_parts.append(pairs[ii][0])
new_words.append("".join(word_parts))
break # stop once the end tag is reached, then look for the next begin

# else: # if the next step has no other M/E, B forms a word on its own
# new_words.append("".join(word_parts))
# break
else: # the sequence breaks at the next step, so stop and look for the next begin !!! original version
break

# b = True
# word_parts.append(pairs[i][0])

if pairs[i][1] == "m":
# if i == (len(pairs) - 1): # bm,
# word_parts.append(pairs[i][0])
# new_words.append("".join(word_parts))
# else:
m = True
word_parts.append(pairs[i][0]) # bm / ,me
# elif pairs[i][1] == "m" and i != 0:
# if pairs[i-1][1] in ["b",'m']:
# m = True
# word_parts.append(pairs[i][0]) # bm / ,me

if pairs[i][1] == 'e' and (b or m): # be / ,me
word_parts.append(pairs[i][0])
new_words.append("".join(word_parts))
# elif pairs[i][1] == 'e' and i != 0:
# if pairs[i-1][1] in ["b",'m'] and (b or m): # be / ,me
# word_parts.append(pairs[i][0])
# new_words.append("".join(word_parts))

b = False
m = False
word_parts = []

if single == True: # only filter out the component terms --- BME/BE/BM
# b = False
# m = False
# word_parts = []

# else:


if single: # only filter out the component terms --- BME/BE/BM
for i in range(len(pairs)):
if pairs[i][1] == "sn":
new_words.append(pairs[i][0])

assert new_words, "There is no new term can be found in string: {}".format(string)

if not new_words:
if one_sentence:
logger.info("There is no new term can be found in string: {} \n The words tokenized by jieba will be return".format(string))
print( "There is no new term can be found in string: {}".format(string))
return pairs
else:
assert new_words, "There is no new term can be found in string: {}".format(string)


return list(set(new_words))





if __name__ == "__main__":
a = get_terms_vtb('通用机打发票的有效期')
print(a)
a = get_terms_vtb('企税月,季,报自动生成的数据不正确或获取不到,如何处理', one_sentence= True)
print(a)
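A minimal batch-usage sketch (not part of this commit; the input file name and the fallback handling are assumptions) showing how get_terms_vtb could be applied line by line to a UTF-8 question file, keeping only the string results and skipping the tokenizer fallbacks:

from HMM_Model import get_terms_vtb

def extract_terms_from_file(path):
    '''Collect new terms from every non-empty line of a UTF-8 text file.'''
    terms = set()
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            result = get_terms_vtb(line, one_sentence=True) or []
            # On failure get_terms_vtb returns the tokenized pairs (or None), so keep only plain strings
            terms.update(w for w in result if isinstance(w, str))
    return terms

if __name__ == "__main__":
    print(extract_terms_from_file("data/questions.txt"))  # hypothetical input file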

11 changes: 5 additions & 6 deletions Matrix_generator.py
@@ -3,18 +3,18 @@
import numpy as np
import configparser
import os

cwd = os.path.split(os.path.realpath(__file__))[0]
# Initialize some data
os.system('python corpus_prep.py')
conf = configparser.ConfigParser()
conf.read(r"./config.txt")
corp = pickle.load(open(r"./data/{}".format(conf.get("data_name","corp_file")), 'rb'))
conf.read(os.path.join(cwd, r"config.txt"))
corp = pickle.load(open(os.path.join(cwd, r"data/{}".format(conf.get("data_name","corp_file"))), 'rb'))
w_att = set([o[1] for i in corp for o in i]) # all POS-tag categories -> set


# Initial hidden-state (POS) probabilities
def head_mat(corp):
'''Build the three matrices from the corpus already tagged with BMES POS labels
'''Build the three matrices from the corpus already tagged with BMES POS labelsc
corp: BMES corpus -> list of list
'''
h_head = defaultdict(int)
@@ -80,6 +80,5 @@ def emit_mat(corp)
t = trans_mat(corp)
e = emit_mat(corp)
for n,m in zip(("t","e"),(t,e)):
pickle.dump(m, open(r"./data/{}_mat.pkl".format(n), "wb"))

pickle.dump(m, open(os.path.join(cwd, r"data/{}_mat.pkl".format(n)), "wb"))
print(t)
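The bodies of head_mat, trans_mat and emit_mat are collapsed in this diff, so as a hedged illustration only: a transition matrix compatible with the log-space lookups in viterbi() (t_mat[k0].get(k1, -np.inf)) could be counted from a BMES-tagged corpus roughly like this — the counting and normalization details are assumptions, not the repository's actual code.

import numpy as np
from collections import defaultdict

def sketch_trans_mat(corp):
    '''corp: list of sentences, each a list of (word, tag) pairs.'''
    counts = defaultdict(lambda: defaultdict(int))
    for sent in corp:
        for (_, t0), (_, t1) in zip(sent, sent[1:]):
            counts[t0][t1] += 1  # count tag bigrams
    t_mat = {}
    for t0, row in counts.items():
        total = sum(row.values())
        # store log-probabilities so viterbi() can add them directly
        t_mat[t0] = {t1: np.log(c / total) for t1, c in row.items()}
    return t_mat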
16 changes: 10 additions & 6 deletions config.txt
@@ -1,6 +1,10 @@
[data_name]
old = tax_terms.txt
new = df_question_0.pk
corp_col = question
corp_file = clean_corp_2022-1-5.pkl

[data_name]
old = tax_terms.txt
new = question_add_hf.pkl
corp_col = question
corp_file = clean_corp_2022-1-21.pkl

[strategy]
keep_stopwords = False
threshold = 3
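A small reading sketch for the [strategy] section above (how threshold is consumed is an assumption, since that code is not shown in this diff):

import configparser, os

cwd = os.path.split(os.path.realpath(__file__))[0]
conf = configparser.ConfigParser()
conf.read(os.path.join(cwd, "config.txt"))
keep_stopwords = conf.getboolean("strategy", "keep_stopwords")  # parses "False" -> False
threshold = conf.getint("strategy", "threshold")                # e.g. a minimum frequency for candidate terms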

