1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
|
require 'fileutils'
require 'matrix'
# Base class for models whose training data is stored as plain text files in
# a directory.
#
# This class does not assign @independent_variables and does not define
# predict(smiles, variables); crossvalidation and batch_predict rely on both,
# so they have to be supplied by subclasses or mixins.
class Model
  # Reads "similarity-thresholds", "smiles" and "dependent-variables"
  # (one value per line) from the model directory.
  # Aborts the process when the number of smiles and dependent variables differ.
  # @param [String] dir model directory
  def initialize dir
    @dir = dir
    @similarity_thresholds = File.readlines(File.join(@dir,"similarity-thresholds")).collect{|v| v.chomp.to_f}
    @smiles = Vector[ *File.readlines(File.join(@dir,"smiles")).collect{|v| v.chomp} ]
    @dependent_variables = Vector[ *File.readlines(File.join(@dir,"dependent-variables")).collect{|v| v.chomp} ]
    abort "Unequal number of smiles (#{@smiles.size}) and dependent-variables (#{@dependent_variables.size})." unless @smiles.size == @dependent_variables.size
  end

  # Runs a crossvalidation with one thread per fold.
  # For each fold the training and test data are written to
  # <dir>/crossvalidation/<fold>/{train,test}, a model of the same class is
  # created from the train directory and its predictions for the test
  # directory are written to <fold>/test/predictions.
  # @param [Integer] folds number of folds
  def crossvalidation folds=10
    start_time = Time.now
    # to_a yields an array of rows for Array, Vector and Matrix alike
    rows = @independent_variables.to_a
    indices = (0...rows.size).to_a.shuffle
    # Fold boundaries are computed up front: advancing a shared cursor inside
    # the threads would let concurrently running folds read the same offset.
    test_folds = fold_test_indices(indices, folds)
    threads = test_folds.each_with_index.collect do |test_idxs,i|
      Thread.new do
        t = Time.now
        puts "Fold #{i} started"
        idxs = {
          :test => test_idxs,
          :train => indices-test_idxs
        }
        # write training/test data
        cv_dir = File.join(@dir,"crossvalidation",i.to_s)
        dirs = {}
        idxs.each do |set,idx|
          d = File.join cv_dir,set.to_s
          dirs[set] = d
          FileUtils.mkdir_p d
          File.open(File.join(d,"independent-variables"),"w+") { |f| f.puts idx.collect{|j| rows[j].join(",")}.join("\n") }
          File.open(File.join(d,"smiles"),"w+") { |f| f.puts idx.collect{|j| @smiles[j]}.join("\n") }
          File.open(File.join(d,"dependent-variables"),"w+"){ |f| f.puts idx.collect{|j| @dependent_variables[j]}.join("\n")}
          # thresholds are only needed to instantiate the training model
          File.open(File.join(d,"similarity-thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") } if set == :train
        end
        # predict
        train_model = self.class.new dirs[:train]
        train_model.batch_predict dirs[:test], File.join(dirs[:test],"predictions")
        puts "Fold #{i}: #{(Time.now-t)/60} min"
      end
    end
    threads.each(&:join)
    puts "Total: #{(Time.now-start_time)/60} min"
  end

  # Predicts every instance of a directory and writes one comma separated
  # line per prediction.
  # @param [String] dir directory with "smiles" and "independent-variables" files
  # @param [String,IO] out path of the output file (truncated and rewritten)
  #   or an object with a public puts method (e.g. $stdout or a StringIO),
  #   which is written to directly and left open
  def batch_predict dir, out=$stdout
    prediction_smiles = File.readlines(File.join(dir,"smiles")).collect{|smi| smi.chomp}
    writer = lambda do |f|
      File.readlines(File.join(dir,"independent-variables")).each_with_index do |line,i|
        variables = line.chomp.split(",")
        f.puts predict(prediction_smiles[i],variables).join(",")
      end
    end
    # File.open cannot take an IO object, so streams are used as they are.
    # Kernel#puts is private, therefore path strings do not match here.
    if out.respond_to?(:puts)
      writer.call out
    else
      File.open(out, "w+", &writer)
    end
  end

  private

  # Splits indices into consecutive folds; the first (size % folds) folds
  # receive one extra element.
  # @param [Array<Integer>] indices
  # @param [Integer] folds
  # @return [Array<Array<Integer>>] test indices for each fold
  def fold_test_indices indices, folds
    base, remainder = indices.size.divmod(folds)
    start = 0
    (0...folds).collect do |i|
      size = i < remainder ? base+1 : base
      fold = indices[start,size] || []
      start += size
      fold
    end
  end
end
# Cosine similarity predictions on selected and scaled numeric variables.
#
# The including class has to provide @independent_variables (rows of numeric
# values; anything whose to_a returns an array of row arrays) and a
# similarity_prediction(smiles, similarities) method.
# zero_variance?, r, mean and standard_deviation are Array extensions and
# Similarity is a module; none of them is defined in this file.
module Cosine
  # Selects variables and scales the training data.
  # Columns with zero variance are dropped, as is every later column that
  # correlates (|r| > 0.9) with an already kept column.
  # Sets @selected (kept column indices), @selected_variable_means,
  # @selected_variable_standard_deviations and @scaled_independent_variables
  # (rows containing only the kept columns).
  def preprocess
    puts "Feature selection"
    t = Time.now
    columns = @independent_variables.to_a.transpose
    keep = Array.new(columns.size, true)
    columns.each_with_index do |c,i|
      next unless keep[i]
      # remove variables with zero variances
      if c.zero_variance?
        keep[i] = false
        next
      end
      # remove correlated variables
      (i+1...columns.size).each do |j|
        next unless keep[j]
        keep[j] = false if c.r(columns[j]).abs > 0.9
      end
    end
    @selected = keep.each_index.select{|i| keep[i]}
    # @selected holds column indices, so the columns (not the rows) are picked
    selected_columns = @selected.collect{|i| columns[i]}
    @selected_variable_means = selected_columns.collect{|c| c.mean}
    @selected_variable_standard_deviations = selected_columns.collect{|c| c.standard_deviation}
    scaled_columns = selected_columns.each_with_index.collect do |col,i|
      # missing values stay nil
      col.collect{|v| v ? (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i] : nil}
    end
    @scaled_independent_variables = scaled_columns.transpose
    puts (Time.now-t)/60
  end

  # Predicts a query from its variables.
  # @param [String] smiles query structure, passed on to similarity_prediction
  # @param [Array] variables values for all (unselected) variables, converted with to_f;
  #   the array itself is not modified
  # @return the result of similarity_prediction
  def predict smiles, variables
    values = variables.collect{|v| v.to_f}
    preprocess unless @scaled_independent_variables # lazy preprocessing
    # scale the query with the same parameters as the training data
    scaled_variables = @selected.each_with_index.collect do |column,i|
      (values[column]-@selected_variable_means[i])/@selected_variable_standard_deviations[i]
    end
    similarities = @scaled_independent_variables.collect{|row| Similarity.cosine([row,scaled_variables])}
    similarity_prediction smiles, similarities
  end
end
# Tanimoto similarity predictions on fingerprints.
# The including class provides @independent_variables (one fingerprint per
# training instance) and similarity_prediction; Compound and Similarity are
# not defined in this file.
module Tanimoto
  # Predicts a structure given as SMILES, using the fingerprint of the
  # Compound created from it.
  # @param [String] smiles
  # @return the result of predict
  def predict_smiles smiles
    fingerprint = Compound.from_smiles(smiles).fingerprint
    predict smiles, fingerprint
  end

  # Compares the query fingerprint with every training fingerprint and hands
  # the similarities over to similarity_prediction.
  # @param [String] smiles
  # @param fingerprint query fingerprint
  # @return the result of similarity_prediction
  def predict smiles, fingerprint
    similarities = @independent_variables.collect do |training_fingerprint|
      Similarity.tanimoto [training_fingerprint, fingerprint]
    end
    similarity_prediction smiles, similarities
  end
end
# Model with binary (0/1) dependent variables, predicted by a similarity
# weighted majority vote of the neighbors.
class ClassificationModel < Model
  # Aborts unless the dependent variables consist of exactly the values
  # "0" and "1", then converts them to integers.
  # @param [String] dir model directory
  def initialize dir
    super dir
    observed = @dependent_variables.uniq.sort
    if observed != ["0","1"]
      abort "Incorrect binary dependent variable values (#{observed.join(",")}). Expecting 0 and 1."
    end
    @dependent_variables = @dependent_variables.collect(&:to_i)
  end

  # Classifies a query from its similarities to the training instances.
  # @param [String] smiles query structure
  # @param similarities one similarity per training instance
  # @return [Array] smiles, experimental value (nil if the query is not in the
  #   training data), classification, probability for 0, probability for 1,
  #   maximum similarity, number of neighbors; classification and
  #   probabilities are nil when less than 2 neighbors remain
  def similarity_prediction smiles, similarities
    candidates = similarities.to_a.each_index
    neighbor_idx = candidates.select{|i| similarities[i] > @similarity_thresholds[1]}
    if neighbor_idx.size < 2 # lower similarity threshold
      neighbor_idx = candidates.select{|i| similarities[i] > @similarity_thresholds[0]}
    end
    neighbor_idx = neighbor_idx.reject{|i| @smiles[i] == smiles} # remove identical compounds
    known_idx = @smiles.to_a.index(smiles)
    experimental = known_idx ? @dependent_variables[known_idx] : nil
    if neighbor_idx.size < 2
      return [smiles,experimental,nil,nil,nil,similarities.max,neighbor_idx.size]
    end
    neighbor_dependent_variables = []
    neighbor_similarities = []
    neighbor_idx.each do |i|
      neighbor_dependent_variables << @dependent_variables[i]
      neighbor_similarities << similarities[i]
    end
    probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_similarities)
    classification = probabilities[1] > probabilities[0] ? 1 : 0
    [ smiles, experimental, classification, *probabilities, neighbor_similarities.max, neighbor_idx.size ]
  end

  # Weighted majority vote
  # For each class the summed weights of its neighbors are multiplied with
  # the maximum weight and divided by the sum of all weights.
  # @param [Array<0,1>] neighbor_dependent_variables
  # @param [Array<Float>] weights
  # @return [Array] probabilities for class 0 and class 1
  def weighted_majority_vote neighbor_dependent_variables, weights
    total = weights.sum.to_f
    scale = weights.max.to_f
    [0, 1].collect do |label|
      members = weights.each_index.select{|i| neighbor_dependent_variables[i] == label}
      class_weight = members.collect{|i| weights[i]}.sum
      scale*class_weight/total
    end
  end
end
# Empty subclass of Model: no regression specific behavior is defined in
# this class body.
class RegressionModel < Model
end
# Classification model that compares fingerprints with the Tanimoto mixin.
class TanimotoClassificationModel < ClassificationModel
  include Tanimoto

  # Reads one comma separated fingerprint per line from the
  # "independent-variables" file of the model directory.
  # Aborts when the number of fingerprints and dependent variables differ.
  # @param [String] dir model directory
  def initialize dir
    super dir
    fingerprints = File.readlines(File.join(@dir,"independent-variables")).collect do |line|
      line.chomp.split(",")
    end
    @independent_variables = Vector[ *fingerprints ]
    return if @dependent_variables.size == @independent_variables.size
    abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.size})."
  end
end
# Classification model that compares numeric variables with the Cosine mixin.
class CosineClassificationModel < ClassificationModel
  include Cosine

  # Reads one comma separated row of numeric values per line from the
  # "independent-variables" file of the model directory into a Matrix.
  # Aborts when the number of rows and dependent variables differ.
  # @param [String] dir model directory
  def initialize dir
    super dir
    @independent_variables = Matrix[
      *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",").collect{|v| v.to_f} }
    ]
    abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.row_count})." unless @dependent_variables.size == @independent_variables.row_count
    # @independent_variable_names is not assigned in this class; calling size
    # on nil would raise, so the check only runs when names are available.
    # Names describe columns, hence the comparison with the column count.
    if @independent_variable_names
      abort "Unequal number of independent-variable-names (#{@independent_variable_names.size}) and independent-variables columns (#{@independent_variables.column_count})." unless @independent_variable_names.size == @independent_variables.column_count
    end
  end
end
# Empty subclass of RegressionModel: the class body defines no methods and
# includes no module.
class TanimotoRegressionModel < RegressionModel
end
# Empty subclass of RegressionModel: the class body defines no methods and
# includes no module.
class CosineRegressionModel < RegressionModel
end
|