-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy path.Rhistory
109 lines (109 loc) · 4.67 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
library(nsga2R)
library(doParallel)
library(MLmetrics)
library(ggplot2)
library(parallel)
library(randomForest)
library(GenAlgo)
library(caret)
library(stringr)
library(devtools)
library(Metrics)
library(iRF)
library(cluster)
library(doSNOW)
library(doMC)
library(purrr)
library(dplyr)
library(SGL)
library(e1071)
library(MLmetrics)
library("e1071", lib.loc="~/R/win-library/3.1")
# Register the doMC foreach backend using its default settings
registerDoMC()
# Fix the random number generator seed so that repeated runs are comparable
set.seed(seed = 100)
# Function used to set up parallel computing.
#
# Stops the cluster held in the global variable `cl` (when there is one),
# starts a fresh cluster on all but one logical core with worker output sent
# to "Log.txt", registers it as the doSNOW foreach backend and stores it back
# in the global `cl` so that the next call can stop it again.
#
# Returns: the new cluster object, invisibly.
resetCluster <- function() {
  # Stop the previous cluster if one exists; an absent or already-stopped
  # cluster must not prevent a new one from being created.
  if (exists("cl", envir = globalenv(), inherits = FALSE)) {
    tryCatch(
      stopCluster(get("cl", envir = globalenv())),
      error = function(e) {
        message("Could not stop previous cluster: ", conditionMessage(e))
      }
    )
  }
  cores <- detectCores(all.tests = FALSE, logical = TRUE)
  # detectCores() can return NA, and a single-core machine would give
  # cores - 1 == 0; always request at least one worker.
  workers <- if (is.na(cores)) 1L else max(1L, cores - 1L)
  newCluster <- makeCluster(workers, outfile = "Log.txt")
  registerDoSNOW(newCluster)
  # The original assigned to a function-local `cl`, which lost the handle;
  # publish it explicitly so the global `cl` always refers to the live cluster.
  assign("cl", newCluster, envir = globalenv())
  invisible(newCluster)
}
# Number of logical cores reported for this machine
cores <- detectCores(all.tests = FALSE, logical = TRUE)

# Reading data sources
# The dataset is stored as an RDS file, so it is read with readRDS(); load()
# only returns the names of restored objects and cannot read .csv/.RDS files.
data <- readRDS('data/completeDataset.RDS')
# Here we remove any variables where the variance is close to zero.
# setdiff() keeps every column when nearZeroVar() flags nothing, whereas
# data[-integer(0)] would silently drop all columns.
data <- data[setdiff(seq_along(data), nearZeroVar(data))]
expressionData <- readRDS('data/expressionOnly.RDS')
metabolicExpression <- readRDS('data/metabolic_gene_data.RDS')
biomassGrowthFlux <- 'r_4041'
biomassIndex <- 717

growth <- data$log2relT # Extract the growth rates
strains <- data$Row # Extract the strains names
# Remove the target and identifier columns so `data` holds features only
data$Row <- NULL
data$log2relT <- NULL
expressionData[, 1] <- NULL # Remove the growth rate data

# Feature names chosen by each feature-selection method (first CSV column)
iRFIndicies <- read.csv('data/Features_Extracted_Using_iRF.csv', header = FALSE)
SGLIndicies <- read.csv('data/Features_Extracted_Using_SGL.csv', header = FALSE)
GeneticAlgoIndicies <- read.csv('data/genetic_feature_selection_features.csv', header = FALSE, stringsAsFactors = FALSE)

# Logical column masks with drop = FALSE keep a data frame even when one or
# zero columns match.
iRFData <- data[, colnames(data) %in% unlist(iRFIndicies[, 1]), drop = FALSE]
SGLData <- data[, colnames(data) %in% unlist(SGLIndicies[, 1]), drop = FALSE]
# flux data is all data without gene data; the negated mask avoids
# -which(...) removing every column when no names overlap.
fluxData <- data[, !(colnames(data) %in% colnames(expressionData)), drop = FALSE]
# Full data partitions ----
# Test-set row positions are read from testing_index.csv; 1 is added
# to account for index changes between R and Python.
testingIndices <- unlist(read.csv('testing_index.csv', header = FALSE)) + 1

# Split the growth target and the feature table into training and test rows
trainingTarget <- growth[-testingIndices]
testingTarget <- growth[testingIndices]
trainingData <- data[-testingIndices, ]
testingData <- data[testingIndices, ]

# Fit the pre-processing on the training rows only, then apply it to both sets
normParam <- preProcess(trainingData)
normalisedTrainingData <- predict(normParam, newdata = trainingData)
normalisedTestData <- predict(normParam, newdata = testingData)
# Expression data partition ----
# Same train/test rows as the full data set
expressionTestingData <- expressionData[testingIndices, ]
expressionTrainingData <- expressionData[-testingIndices, ]
# Pre-processing parameters come from the training rows only
normParamExp <- preProcess(expressionTrainingData)
normalisedExpressionTestingData <- predict(normParamExp, newdata = expressionTestingData)
normalisedExpressionTrainingData <- predict(normParamExp, newdata = expressionTrainingData)
# Flux data partition ----
# Same train/test rows as the full data set
fluxTestingData <- fluxData[testingIndices, ]
fluxTrainingData <- fluxData[-testingIndices, ]
# Pre-processing parameters come from the training rows only
normParamFlux <- preProcess(fluxTrainingData)
normalisedFluxTest <- predict(normParamFlux, newdata = fluxTestingData)
normalisedFluxTrain <- predict(normParamFlux, newdata = fluxTrainingData)
# Metabolic expression data partition ----
# Same train/test rows as the full data set
metabolicExpressionTestingData <- metabolicExpression[testingIndices, ]
metabolicExpressionTrainingData <- metabolicExpression[-testingIndices, ]
# Pre-processing parameters come from the training rows only
normParamMetabolicExpression <- preProcess(metabolicExpressionTrainingData)
normalisedMetabolicExpressionTest <- predict(normParamMetabolicExpression, newdata = metabolicExpressionTestingData)
normalisedMetabolicExpressionTrain <- predict(normParamMetabolicExpression, newdata = metabolicExpressionTrainingData)
# SGL expression data partition ----
# Same train/test rows as the full data set
SGLTestingData <- SGLData[testingIndices, ]
SGLTrainingData <- SGLData[-testingIndices, ]
# Pre-processing parameters come from the training rows only
normParamSGL <- preProcess(SGLTrainingData)
normalisedSGLTest <- predict(normParamSGL, newdata = SGLTestingData)
normalisedSGLTrain <- predict(normParamSGL, newdata = SGLTrainingData)
# iRF expression data partition ----
# Same train/test rows as the full data set
iRFTestingData <- iRFData[testingIndices, ]
iRFTrainingData <- iRFData[-testingIndices, ]
# Pre-processing parameters come from the training rows only
normParamiRF <- preProcess(iRFTrainingData)
normalisediRFTest <- predict(normParamiRF, newdata = iRFTestingData)
normalisediRFTrain <- predict(normParamiRF, newdata = iRFTrainingData)
# Genetic algo features, need to check the normalisation is correct - come back to this
# Column j of the CSV is used as the j-th feature set: the first word of each
# entry is taken as a column name of `data`.
gen_features_index <- read.csv('data/genetic_feature_selection_features.csv', header = FALSE, stringsAsFactors = FALSE)
# NOTE(review): the lists are sized from the number of CSV rows minus one,
# while the loop below walks over 9 columns - confirm which is intended.
normGenTrain <- vector(mode = "list", length = dim(gen_features_index)[1] - 1)
normGenTest <- vector(mode = "list", length = dim(gen_features_index)[1] - 1)
for (j in seq_len(9)) {
  features <- word(unlist(gen_features_index[, j]))
  print(features)
  # drop = FALSE keeps a data frame even when a set holds a single feature
  tr <- data[-testingIndices, features, drop = FALSE]
  te <- data[testingIndices, features, drop = FALSE]
  # Use a dedicated name so the full-data `normParam` is not overwritten
  normParamGen <- preProcess(tr)
  normGenTrain[[j]] <- predict(normParamGen, tr)
  normGenTest[[j]] <- predict(normParamGen, te)
}