-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenerate_features.py
71 lines (53 loc) · 2.38 KB
/
generate_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from features.pos_tags_features import POSTagFeatures
from features.sentence_struct_features import SentenceStructureFeatures
from features.sentence_sentm_features import SentenceSentimentFeatures
from features.sentence_word_emb import GloVeFeatures
from tqdm import tqdm
import pandas as pd
import traceback
from utils.data_preprocess import generate_data
from logger import Logger
# Module-level placeholders. generate_training_data never declares these names
# `global`, so calling it leaves both of them set to None.
pos_extractor = structure_extractor = None
def generate_training_data(dataset, file_to_save, logger):
    """Compute features for every dataset entry and save them as a CSV file.

    For each row, the last element of ``row['postText']`` is passed to the
    POS-tag, sentiment, sentence-structure and GloVe extractors, and the lists
    they return are concatenated after ``row['id']``.

    Args:
        dataset: Iterable of rows that can be indexed with ``'id'`` and
            ``'postText'``.
        file_to_save: File name passed to ``logger.get_data_file`` to obtain
            the path the CSV is written to.
        logger: Object providing ``log`` and ``get_data_file``; it is also
            handed to every feature extractor.

    Returns:
        A pandas DataFrame with an ``"ID"`` column followed by the column
        names returned by each extractor's ``end_computing_features``.

    Raises:
        SystemExit: With code -1 when creating an extractor or computing the
            features fails for any reason.
    """
    # Bound before the ``try`` so the failure handler can always inspect them,
    # even when the error happens before they receive a real value.
    pos_extractor = None
    structure_extractor = None
    row = None
    features_list = []

    try:
        pos_extractor = POSTagFeatures(logger)
        sentiment_extractor = SentenceSentimentFeatures(logger)
        structure_extractor = SentenceStructureFeatures(logger)
        glove_extractor = GloVeFeatures(logger)

        logger.log("Start calculating features ...")
        for row in tqdm(dataset):
            crt_sentence = row['postText'][-1]
            crt_feats = [row['id']]
            crt_feats += pos_extractor.compute_features_per_sentence(crt_sentence)
            crt_feats += sentiment_extractor.compute_features_per_sentence(crt_sentence)
            crt_feats += structure_extractor.compute_features_per_sentence(crt_sentence)
            crt_feats += glove_extractor.compute_features_per_sentence(crt_sentence)
            features_list.append(crt_feats)
    except BaseException:
        # Deliberately as broad as the original bare ``except``: the CoreNLP
        # servers must also be closed when the run is interrupted (Ctrl-C).
        traceback.print_exc()

        if row is None:
            logger.log("Error generated before any entry was processed")
        else:
            # The row itself may be what is malformed, so do not let the
            # error report raise a second exception.
            try:
                failing_entry = row['postText'][-1]
            except (KeyError, IndexError, TypeError):
                failing_entry = row
            logger.log("Error generated at {}".format(failing_entry))

        # Close every server that was actually started; one failing close
        # must not prevent the other from being attempted.
        for extractor in (pos_extractor, structure_extractor):
            if extractor is None:
                continue
            try:
                extractor.core_nlp.close_server()
            except Exception:
                traceback.print_exc()

        # Same effect as ``exit(-1)`` without relying on the ``site`` module.
        raise SystemExit(-1)

    # An empty dataset yields no feature rows; report zero features instead of
    # failing on ``features_list[-1]``.
    feature_count = len(features_list[-1]) - 1 if features_list else 0
    logger.log("Finish calculating {} features for {} entries".format(
        feature_count, len(features_list)), show_time=True)

    colnames = ["ID"]
    colnames += pos_extractor.end_computing_features()
    colnames += sentiment_extractor.end_computing_features()
    colnames += structure_extractor.end_computing_features()
    colnames += glove_extractor.end_computing_features()

    df = pd.DataFrame(features_list, columns=colnames)
    logger.log("Features dataframe snippet \n {}".format(df.head()))

    output_path = logger.get_data_file(file_to_save)
    logger.log("Save features to {}".format(output_path))
    df.to_csv(output_path, index=False)

    return df
# Dataset variant passed to generate_data and used as the prefix of the output
# file name. Values noted by the original author: "small", "large", "custom".
DATA_TYPE = "custom"


if __name__ == '__main__':
    # Script entry point: create the logger, load the selected dataset, then
    # compute its features and write them out under "<DATA_TYPE>_train.csv".
    logger = Logger(
        show=True,
        html_output=True,
        config_file="config.txt",
    )
    output_name = DATA_TYPE + "_train.csv"
    dataset = generate_data(DATA_TYPE, logger)
    df = generate_training_data(dataset, output_name, logger)