main.py
from HMM_Model import get_terms_vtb, tokenizer
from word_filter import overlap_out
import pickle
import os
import logging
import configparser

logger = logging.getLogger()  # create the (root) logger object
logger.root.setLevel(logging.INFO)  # allow records at INFO level and above
# console_handler = logging.StreamHandler()  # optionally mirror log output to the console
file_handler = logging.FileHandler(filename="./logs/log.log", encoding='utf8')  # write log records to a file

# Set the record format and attach it to the handler(s)
formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
# console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

# Register the handler(s) with the logger
# logger.addHandler(console_handler)
logger.addHandler(file_handler)

cwd = os.path.split(os.path.realpath(__file__))[0]
conf = configparser.ConfigParser()
conf.read(os.path.join(cwd, r"config.txt"))
keep_stopwords = conf.getboolean("strategy", "keep_stopwords")
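
# For reference, config.txt is expected to provide a [strategy] section with a
# keep_stopwords boolean, since that is what conf.getboolean reads above.
# A minimal sketch (the actual file in the repo may contain more options,
# and the value shown here is only an assumed default):
#
#   [strategy]
#   keep_stopwords = false
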
def new_att(sentence, word):
    """
    Map the part-of-speech tags of a candidate new word to an "n" / "nv" tag.
    sentence: the original corpus sentence the new word was found in
    word: the candidate new word
    """
    w_att = tokenizer(sentence, keep_stopwords=keep_stopwords)
    a = set()
    for i in w_att:
        if i.word in word:  # collect the flags of tokens covered by the candidate word
            a.add(i.flag)
    att = set()
    for i in a:
        if "l" in i:        # idiom-like tags are treated as nouns
            att.add("n")
        elif "n" in i:
            att.add("n")
        elif "v" in i:
            att.add("v")
    if "n" in att and "v" in att:
        return "nv"
    if len(att) == 1 and "n" in att:
        return "n"
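
# Usage sketch for new_att (hypothetical inputs; assumes tokenizer yields
# jieba.posseg-style pairs exposing .word and .flag, as used above):
#
#   >>> new_att("机器学习是热门方向", "机器学习")
#   'n'   # if the tokens covering the word all carry noun-like flags
#
# A mix of noun and verb flags yields "nv"; any other combination falls
# through and the function implicitly returns None.
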
def get_terms(filepath, n_only=True, overlaps=False, one_sentence=False):
    """
    Return the final list of discovered new words.
    filepath: path to the .pkl file to mine for new words
    one_sentence: whether the caller passes a single sentence or a txt file
    """
    with open(filepath, 'rb') as f:
        t = pickle.load(f)
    res = []
    for s in t:
        try:
            new_word = get_terms_vtb(s, one_sentence)
            for nw in new_word:
                NW = [nw, new_att(s, nw)]
                if NW not in res:  # avoid extracting the same word more than once from one sentence
                    res.append(NW)
        except Exception as E:
            logger.info(E)
    if n_only:
        e = []
        for i in res:
            if i[1] == "n":  # keep noun candidates only
                e.append(i)
        if overlaps:
            return e
        else:
            return overlap_out(e)
    else:
        if overlaps:
            return res
        else:
            return overlap_out(res)
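
# Sketch of how an input .pkl such as Mining/TTT.pkl could be produced
# (assumption: the pickle holds an iterable of raw sentence strings, which is
# what the loop in get_terms iterates over; the corpus path reuses the
# test_corp.txt file referenced in the commented-out line below):
#
#   with open("./Mining/test_corp.txt", "r", encoding="utf8") as t:
#       sentences = [line.strip() for line in t if line.strip()]
#   with open("./Mining/TTT.pkl", "wb") as f:
#       pickle.dump(sentences, f)
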
if __name__ == "__main__":
    # with open(r"./Mining/test_corp.txt", 'r', encoding='utf8') as t:
    cwd = os.path.split(os.path.realpath(__file__))[0]
    filepath = os.path.join(cwd, r"Mining/TTT.pkl")
    res = get_terms(filepath, n_only=True, overlaps=False)
    print(res, len(res))
    with open(os.path.join(cwd, r"Mining/NT.pkl"), 'wb') as f:
        pickle.dump(res, f)
    # a = pickle.load(open(os.path.join(cwd, r"Mining/NT.pkl"), 'rb'))
    # print(get_terms_vtb("选择", one_sentence=False, single=True))