-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path dna_classification_vcu_predictor.py
67 lines (47 loc) · 3.17 KB
/
dna_classification_vcu_predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""dna-classification-vcu.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/13BLlKCJc5EiKMHP7KEH5hOXyLEIUdX02
# DNA Multi Class Classification
"""
"""## Prepare Google Drive"""
# Run this cell to mount your Google Drive.
local_path = './'
"""## Prepare fastai"""
from fastai import *
from fastai.text import *
"""## Prepare Dataset"""
local_project_path = local_path + 'dna-10class/'
if not os.path.exists(local_project_path):
os.makedirs(local_project_path)
print('local_project_path:', local_project_path)
"""## Create Language Model"""
class dna_tokenizer(BaseTokenizer):
    """Character-level tokenizer: splits a DNA sequence string into a list of
    single-character tokens (one token per base)."""

    def tokenizer(self, t):
        # Fix: the original misspelled the first parameter as `slef`. It still
        # worked because the instance binds positionally regardless of the
        # name, but the typo breaks keyword calls and misleads readers.
        return list(t)
# Character-level tokenization with no fastai pre/post rules or special-case
# tokens, so the vocabulary reduces to the raw DNA characters in the data.
tokenizer = Tokenizer(tok_func=dna_tokenizer, pre_rules=[], post_rules=[], special_cases=[])
# Processing chain for the classifier data below: tokenize (no BOS/EOS
# markers), then numericalize with a 30k-token vocab cap.
processor = [TokenizeProcessor(tokenizer=tokenizer, include_bos= False, include_eos=False), NumericalizeProcessor(max_vocab=30000)]
# batch size
bs = 64
# Language-model data bunch built from combined.csv; in this script it is
# used only to supply its vocab to the classifier data bunch below.
# NOTE(review): 'combined.csv' must exist under local_project_path with a
# 'Text' column — not verifiable from this file.
data_lm = TextLMDataBunch.from_csv(local_project_path, 'combined.csv',
                                   text_cols ='Text', valid_pct= 0.1, tokenizer=tokenizer,
                                   include_bos= False, include_eos=False)
# Language-model fine-tuning steps, kept for reference (run previously to
# produce the saved encoder/classifier weights loaded below).
# learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3, pretrained=False).to_fp16()
# learn.load('lm-fine-tuned-10-6')
# learn.save_encoder('lm-fine-tuned-10-6-encoder')
"""## Create Classifier"""
# Earlier variant using a random valid_pct split instead of the 'is_test'
# column; superseded by the TextList pipeline below.
# data_cls = TextClasDataBunch.from_csv(local_project_path, 'combined_train.csv',
# text_cols ='Text', label_cols ='class', valid_pct= 0.1, tokenizer=tokenizer,
# include_bos= False, include_eos=False, vocab = data_lm.vocab)
# Classifier data: same CSV, split into train/valid by the 'is_test' column,
# labeled from 'class', reusing the language model's vocab so token ids match.
data_cls = (TextList.from_csv(local_project_path, 'combined.csv', cols='Text', vocab=data_lm.vocab, processor= processor)
            .split_from_df(col='is_test')
            .label_from_df(cols='class')
            .databunch(bs=bs))
print('data_cls validation set size', len(data_cls.valid_ds))
# data_cls.show_batch()
# AWD-LSTM classifier in mixed precision; weights come entirely from the
# saved checkpoint 'fifth' (pretrained=False, so no downloaded weights).
learn_cls = text_classifier_learner(data_cls, AWD_LSTM, drop_mult=0.3, pretrained=False).to_fp16()
learn_cls.load('fifth')
# Example DNA sequence to classify (lowercase a/c/g/t string).
dna_string = 'atggcagtggcaggtaaaaatgactttgcagttctcaacaccgggcggaagatgcctctccttgggctgggaacatggaagagtgaacctggcaaggttaaacaggcagtaatctgggccttgcaggctggctaccgccacttcgactgtgctgccatctatggcaacgagttggagatcggagaagctctgcaggagacacttggccctgacaaagccttgaggcgagaggatgtgtttatcacctccaagctgtggaacacacagcatcacccggaggatgtggagcccgctctgctgaagacactgaaggagctgagtctggaatacctggatctatacctcatccactggccctatgccttccaacaaggtgacgctcctttccccaaatcggaggacggcaccctgctgtacgacgacatcgactacaagctgacttgggctgccatggagaagctggtgggaaagggcctggtcagggctatcggcctgtccaacttcaacagcaaacagatagacaacgttctctccgtagccaacatcaaaccgactgtgcttcaggtggaaagccatccgtatctggctcaggtggagttgctgggacactgccgggacagaggcctggtgattacagcgtacagcccactggggtcaccggatcgggtatggaagcatcctgatgagcccgtcctcctggatgaagcagcaatcgacaccctggccaagaagtacaacaagtccccagcacaaatcatccttagatggcagacacagcgaggagtagtgacgatccctaaaagtgtgacagagtctcggatcaaagagaatattcaggtatttgactttacccttgaagcggaagagatgaagtgtataacagcattgaacagaggctggcgctacattgtaccaaccatcacagttgatgggaagcccgtccccagggatgcaggacatccacactaccccttcagtgacccctactga'
# Returns a (category, class index tensor, probabilities tensor) triple.
res = learn_cls.predict(dna_string)
print(res)