-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinstructor-script.R
210 lines (145 loc) · 6.21 KB
/
instructor-script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
##################################################
# R Programming and data Analysis Boot Camp
# 2019-Feb-22
#
# The sample data was extracted from a DNA methylation dataset at:
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE52588
##################################################
# SESSION 3
##################################################
# Import Excel data.
# WARNING: Your path may be different
library(readxl)
mtable <- read_excel("~/Desktop/r-course/r-sample-data.20190222.xlsx", skip = 5)
# Alternatively, choose the file interactively each time
# mtable <- read_excel(file.choose(), skip = 5)
##################################################
# examine class and content of the data
summary(mtable)
mtable
class(mtable)
# force tibble into the old-style data frame if you wish
mdf = data.frame(mtable)
summary(mdf)
mdf
class(mdf)
##################################################
# try boxplot options
boxplot(mdf) # ERROR: first 2 columns are not numeric
# boxplot after removing the 1st and 2nd columns
boxplot(mdf[ , 3:89])
boxplot(mdf[ , 3:ncol(mdf)])
boxplot(mdf[ , -1:2]) # ERROR: invalid idexing
boxplot(mdf[ , -(1:2)])
mdata = mdf[ , -(1:2)]
boxplot(mdata)
##################################################
# Other visualization options
boxplot(mdata, col = 'grey')
boxplot(mdata, col = 'grey', las = 0)
boxplot(mdata, col = 'grey', las = 1)
boxplot(mdata, col = 'grey', las = 2)
boxplot(mdata, col = 'grey', las = 3)
?boxplot # Help page
# create a function
my_boxplot = function(x) {
boxplot(mdata, col = 'grey', las = x)
}
my_boxplot(2)
my_boxplot(3)
##################################################
# Issues with variable scope and assignment operator "<-"
x=5 # global variable x, UNrelated to the parameter x in my_boxplot
my_boxplot(x=2) # here x is the parameter of the function my_boxplot
my_boxplot(x<-2) # WARNING: global variable x is overridden with a new value
# In the previous line, the assignment operator "<-" was interpreted
# in the global scope, which may be completely unintended and a source of BUGS.
# In essence, the value '2' was first assigned to the global variable x;
# then the result of this operation was passed as a parameter to the function.
##################################################
# Another example of BUGS with "<-" if spaces are not used properly:
# The intended behaviour is to check if x is less than -1, as in: "x < -1"
# The actual behaviour is a BUG assigning the value 1 to x, as in: "x <- 1".
x=5
if(x<-1) {
print("Weird")
} else {
print("OK")
}
##################################################
# SESSION 4
##################################################
# collection of numbers
x = 1:10 # sequence of consecutive integers
x = seq(from=1, to=10, by=4) # same but using sequence function
x = c(1,2,3,4,5,6,7,8,9,10) # same but using concatenation function
x = c(1,2,37,4,NA,6,73,8,9,100) # can concatenate any non-sequential values
# applying an operation to every element of a vector
x=1:10
sqrt(x) # sqrt of every element - dump on the console screen
x^2 # square of every element - dump on the console screen
y = x^2 # square of every element - save in a new vector y
x = x^2 # square of every element - save in the same vector x, overriding it
mean(x) # computes a single value, not element-by-element
# generic way of applying an operation to every element of a vector
x=1:10
sqrt(x) # use a pre-existing function sqrt
sapply( x, sqrt) # use a pre-existing function sqrt
sapply( x, function(k) sqrt(k)) # use a pre-existing function sqrt
sapply( x, function(k) { sqrt(k) + 1 - log10(k) } ) # define a new function
# define a new function separately, give it a name, then use it by name
heavy_computation = function(k) {
sqrt(k) + 1 - log10(k)
}
sapply( x, heavy_computation )
# Assuming some functions are defined in a file 'my_functions.R', load them:
#source('my_functions.R')
##################################################
# force your data into a matrix object
mm = data.matrix(mdata)
class(mm)
# Pearson correlations between pairs of matrix columns (or samples)
cor( mm[,1] , mm[,2])
cor( mm[,1] , mm[,50] )
cor( mm[,50] , mm[,51] )
# Show scatter plots between pairs of matrix columns (or samples)
plot( mm[,1] , mm[,2] )
plot( mm[,1] , mm[,50] )
plot( mm[,50] , mm[,51] )
# Find correlations of column #1 to all other columns of the matrix
sapply( 2:87 , function(k) cor( mm[,1] , mm[,k]) )
corr_to_col1 = function(k) { cor( mm[,1] , mm[,k]) }
sapply( 2:87 , corr_to_col1) # first give a name to the function, then use it.
##################################################
# Matrix of all pairwise correlations between all columns of mm
corr_mm = cor(mm)
heatmap( corr_mm ) # show a basic heatmap
# WARNING: Need to install the package 'pheatmap' first
library('pheatmap')
pheatmap( corr_mm ) # show a pretty heatmap
##################################################
# Operations of the column labels
colnames(mm) # show all column names
grep('c', colnames(mm)) # indices of the control columns
grepl('c', colnames(mm)) # logical values whether the column are controls
grep('c', colnames(mm), value = TRUE) # actual names of the control columns
cols_control = grep('c', colnames(mm), value = TRUE) # control sample names
cols_disease = grep('d', colnames(mm), value = TRUE) # disease sample names
##################################################
# Testing the difference between control vs disease in gene #1 and gene #2
t.test ( mm[ 1, cols_control] , mm[ 1, cols_disease])
tt = t.test ( mm[ 2, cols_control] , mm[ 2, cols_disease])
summary(tt)
class(tt)
str(tt)
tt$p.value
# Testing the difference between control vs disease in all genes at once.
# Use a vector of row indices, extracting one index at a time.
# For each index 'k', apply a t-test as before.
sapply(1:132 , function(k) t.test ( mm[k, cols_control] , mm[k, cols_disease]) $ p.value)
# This function extracts the actual rows themselves, rather than their indices.
# For each row (which is a vector of numbers), apply a t-test.
apply(mm , 1, function(row) t.test ( row[cols_control] , row[cols_disease]) $ p.value)
##################################################
# END OF SCRIPT
##################################################