-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdna_classification.py
131 lines (82 loc) · 3.11 KB
/
dna_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
"""dna-classification.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1xa8IiWF0IHX-x54GDcVR1aRKKlhtjDsv
# DNA Multi Class Classification
## Prepare Google Drive
"""
# Run this cell to mount your Google Drive.
# from google.colab import drive
# drive.mount('/content/drive')
import os

# Root of the mounted Google Drive (only meaningful when running on Colab).
my_drive_path = '/content/drive/My Drive/'
# Base directory when running locally — everything is created relative to it.
local_path = './'
"""## Prepare fastai"""
# Make ready for fast.ai
# !curl -s https://course.fast.ai/setup/colab | bash
# !pip -q install git+https://github.com/fastai/fastai.git -U
from fastai import *
from fastai.text import *
"""## Prepare Dataset"""
# Working directory for this project's data and saved models.
local_project_path = local_path + 'dna-10class/'
# exist_ok=True replaces the racy exists()-then-makedirs() pattern and makes
# re-running the cell safe.
os.makedirs(local_project_path, exist_ok=True)
print('local_project_path:', local_project_path)

# Dataset CSV hosted in the project repo. (Variable name "reuters_url" is a
# leftover from the Reuters tutorial this notebook was adapted from.)
reuters_url = 'https://github.com/panaali/DNA-classfication/raw/master/combined.csv'
# download and unzip file
import urllib.request
# urllib.request.urlretrieve(reuters_url , local_project_path + 'combined.csv')
"""## Create Language Model"""
# df = pd.read_csv(local_project_path + 'combined.csv')
# example_text = df.iloc[0]['Text']; example_text
class dna_tokenizer(BaseTokenizer):
    """Character-level tokenizer: every character of the sequence is a token."""

    def tokenizer(self, t):
        # Fixed typo: the first parameter was misspelled `slef`. Positional
        # binding made it work anyway, but `self` is the correct name.
        return list(t)
# Wrap the character-level tok_func with no pre/post rules and no special
# cases, so sequences pass through untouched. n_cpus=1 keeps tokenization
# single-process (simpler to debug on Colab).
tokenizer = Tokenizer(tok_func=dna_tokenizer, pre_rules=[], post_rules=[], special_cases=[''], n_cpus=1)
# tok = dna_tokenizer('dna')
# print(' '.join(tokenizer.process_text(example_text, tok=tok)))
# batch size
bs = 64
# import pdb; pdb.set_trace()
# Build the language-model DataBunch straight from the CSV; BOS/EOS markers
# are suppressed — presumably because sequences are raw DNA strings with no
# sentence structure (TODO confirm).
data_lm = TextLMDataBunch.from_csv(local_project_path, 'combined.csv', text_cols ='Text', valid_pct= 0.1, tokenizer=tokenizer, include_bos= False, include_eos=False)
print(data_lm.train_ds[0][0])
print(data_lm.vocab.itos)
# NOTE(review): exit() aborts the whole script here — nothing below this line
# ever runs. Looks like a debugging stop; remove once the pipeline is verified.
exit()
"""## Create Language Model Learner"""
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)#.to_fp16()
learn.lr_find()
learn.recorder.plot(skip_end = 15)
learn.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))
learn.save('lm-first')
learn.load('lm-first')
learn.unfreeze()
learn.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))
learn.save('lm-fine-tuned')
learn.load('lm-fine-tuned');
TEXT = "Marathon Petroleum Co said it reduced"
N_WORDS = 40
N_SENTENCES = 2
print("\n".join(learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)))
learn.save_encoder('fine_tuned_enc')
learn.load_encoder('fine_tuned_enc')
"""## Create Classifier"""
from io import StringIO
csv_string = ''
fp = open(local_project_path + 'cats.txt')
for line in fp:
filename = local_project_path + line.split(' ', maxsplit=1)[0]
labels = line.split(' ', maxsplit=1)[1].strip()
csv_string += filename + ',' + labels + '\n'
print(csv_string[:100])
label_df = pd.read_csv(StringIO(csv_string), header=None)
label_df.head()
def label_func(filename):
    """Return the label string (column 1 of label_df) for *filename*.

    Fixed: the original used ``label_df.iloc[boolean Series]``, which pandas
    rejects ("iLocation based boolean indexing cannot use an indexable as a
    mask") — ``.loc`` is the mask-capable accessor. Returning the label cell
    itself (rather than matching rows) is what fastai's ``label_from_func``
    expects.

    Raises IndexError if *filename* has no row in label_df.
    """
    matches = label_df.loc[label_df[0] == filename, 1]
    return matches.iloc[0]
# Assemble classifier data: texts from the training/test folders, a random
# 10% validation split, labels looked up per file via label_func.
# NOTE(review): no tokenizer is passed here, so fastai's default (word-level)
# tokenizer is used — pass tokenizer=tokenizer if the character-level scheme
# from the language model should carry over. Also, rebinding `data_lm` here
# shadows the language-model bunch; a distinct name (e.g. data_clas) would be
# clearer, but is kept for compatibility with any later cells.
textlist = (TextList.from_folder(local_project_path)
            .filter_by_folder(include=['training', 'test'])
            .split_by_rand_pct(0.1)
            .label_from_func(label_func))
data_lm = textlist.databunch(bs = bs)
# Fixed: fastai DataBunch exposes show_batch(), not show_bunch(), which would
# raise AttributeError.
data_lm.show_batch()