-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdna_classification.py
131 lines (82 loc) · 3.11 KB
/
dna_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
"""dna-classification.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1xa8IiWF0IHX-x54GDcVR1aRKKlhtjDsv
# DNA Multi Class Classification
## Prepare Google Drive
"""
# Run this cell to mount your Google Drive.
# from google.colab import drive
# drive.mount('/content/drive')
import os

# Root of the mounted Google Drive (only meaningful when running on Colab).
my_drive_path = '/content/drive/My Drive/'
# Base directory when running locally — everything is created relative to it.
local_path = './'
"""## Prepare fastai"""
# Make ready for fast.ai
# !curl -s https://course.fast.ai/setup/colab | bash
# !pip -q install git+https://github.com/fastai/fastai.git -U
from fastai import *
from fastai.text import *
"""## Prepare Dataset"""
# Working directory for this project's data and saved models.
local_project_path = local_path + 'dna-10class/'
# exist_ok=True replaces the racy exists()-then-makedirs() pattern and makes
# re-running the cell safe.
os.makedirs(local_project_path, exist_ok=True)
print('local_project_path:', local_project_path)

# Dataset CSV hosted in the project repo. (Variable name "reuters_url" is a
# leftover from the Reuters tutorial this notebook was adapted from.)
reuters_url = 'https://github.com/panaali/DNA-classfication/raw/master/combined.csv'
# download and unzip file
import urllib.request
# urllib.request.urlretrieve(reuters_url , local_project_path + 'combined.csv')
"""## Create Language Model"""
# df = pd.read_csv(local_project_path + 'combined.csv')
# example_text = df.iloc[0]['Text']; example_text
class dna_tokenizer(BaseTokenizer):
    """Character-level tokenizer: every character of the sequence is a token."""

    def tokenizer(self, t):
        # Fixed typo: the first parameter was misspelled `slef`. Positional
        # binding made it work anyway, but `self` is the correct name.
        return list(t)
# Wrap the character-level tok_func with no pre/post rules and no special
# cases, so sequences pass through untouched. n_cpus=1 keeps tokenization
# single-process (simpler to debug on Colab).
tokenizer = Tokenizer(tok_func=dna_tokenizer, pre_rules=[], post_rules=[], special_cases=[''], n_cpus=1)
# tok = dna_tokenizer('dna')
# print(' '.join(tokenizer.process_text(example_text, tok=tok)))
# batch size
bs = 64
# import pdb; pdb.set_trace()
# Build the language-model DataBunch straight from the CSV; BOS/EOS markers
# are suppressed — presumably because sequences are raw DNA strings with no
# sentence structure (TODO confirm).
data_lm = TextLMDataBunch.from_csv(local_project_path, 'combined.csv', text_cols ='Text', valid_pct= 0.1, tokenizer=tokenizer, include_bos= False, include_eos=False)
print(data_lm.train_ds[0][0])
print(data_lm.vocab.itos)
# NOTE(review): exit() aborts the whole script here — nothing below this line
# ever runs. Looks like a debugging stop; remove once the pipeline is verified.
exit()
"""## Create Language Model Learner"""
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)#.to_fp16()
learn.lr_find()
learn.recorder.plot(skip_end = 15)
learn.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))
learn.save('lm-first')
learn.load('lm-first')
learn.unfreeze()
learn.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))
learn.save('lm-fine-tuned')
learn.load('lm-fine-tuned');
TEXT = "Marathon Petroleum Co said it reduced"
N_WORDS = 40
N_SENTENCES = 2
print("\n".join(learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)))
learn.save_encoder('fine_tuned_enc')
learn.load_encoder('fine_tuned_enc')
"""## Create Classifier"""
from io import StringIO
csv_string = ''
fp = open(local_project_path + 'cats.txt')
for line in fp:
filename = local_project_path + line.split(' ', maxsplit=1)[0]
labels = line.split(' ', maxsplit=1)[1].strip()
csv_string += filename + ',' + labels + '\n'
print(csv_string[:100])
label_df = pd.read_csv(StringIO(csv_string), header=None)
label_df.head()
def label_func(filename):
    """Return the label string (column 1 of label_df) for *filename*.

    Fixed: the original used ``label_df.iloc[boolean Series]``, which pandas
    rejects ("iLocation based boolean indexing cannot use an indexable as a
    mask") — ``.loc`` is the mask-capable accessor. Returning the label cell
    itself (rather than matching rows) is what fastai's ``label_from_func``
    expects.

    Raises IndexError if *filename* has no row in label_df.
    """
    matches = label_df.loc[label_df[0] == filename, 1]
    return matches.iloc[0]
# Assemble classifier data: texts from the training/test folders, a random
# 10% validation split, labels looked up per file via label_func.
# NOTE(review): no tokenizer is passed here, so fastai's default (word-level)
# tokenizer is used — pass tokenizer=tokenizer if the character-level scheme
# from the language model should carry over. Also, rebinding `data_lm` here
# shadows the language-model bunch; a distinct name (e.g. data_clas) would be
# clearer, but is kept for compatibility with any later cells.
textlist = (TextList.from_folder(local_project_path)
            .filter_by_folder(include=['training', 'test'])
            .split_by_rand_pct(0.1)
            .label_from_func(label_func))
data_lm = textlist.databunch(bs = bs)
# Fixed: fastai DataBunch exposes show_batch(), not show_bunch(), which would
# raise AttributeError.
data_lm.show_batch()