1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
|
# Base class for models that are stored as plain-text files in a directory.
#
# The directory must contain the files read by #initialize:
# +dependent_variables+, +dependent_variable_type+, +independent_variables+,
# +independent_variable_type+ and +similarity_thresholds+.
class Model
  # Load the model data from +dir+.
  #
  # Aborts the process when the dependent variable type is "binary" but the
  # distinct values are not exactly "0" and "1".
  #
  # @param dir [String] directory containing the model files
  def initialize(dir)
    @dir = dir
    @dependent_variables = File.readlines(File.join(@dir, "dependent_variables")).collect { |v| v.chomp }
    @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp
    if @dependent_variable_type == "binary"
      abort "Incorrect dependent variable values '#{@dependent_variables.uniq.sort.join(",")}' for #{@dependent_variable_type} values" unless @dependent_variables.uniq.sort == ["0","1"]
      @dependent_variables = @dependent_variables.collect { |v| v.to_i }
    elsif @dependent_variable_type == "numeric"
      # TODO: check for floats (String#to_f silently maps non-numeric text to 0.0)
      @dependent_variables = @dependent_variables.collect { |v| v.to_f }
    end
    @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp
    @independent_variables = []
    @smiles = []
    # Each line is "<smiles>,<v1>,<v2>,...": the first column is kept separately.
    File.readlines(File.join(@dir, "independent_variables")).each do |line|
      items = line.chomp.split(",")
      @smiles << items.shift
      items.collect! { |v| v.to_f } if @independent_variable_type == "numeric"
      @independent_variables << items
    end
    # One threshold per line, parsed as floats.
    @similarity_thresholds = File.readlines(File.join(@dir, "similarity_thresholds")).collect { |v| v.chomp.to_f }
  end

  # Run a crossvalidation with randomly shuffled instances.
  #
  # For every fold the test and training data are written to
  # "<dir>/crossvalidation/<fold>/{test,train}", a new model of the receiver's
  # class is loaded from the training directory and +predict_fold+ is called
  # on it with the path of the test fold's +independent_variables+ file.
  # Model itself does not define +predict_fold+, so the receiver's class has
  # to provide it. Timing information is printed to standard output.
  #
  # @param folds [Integer] number of folds
  # @return [nil]
  def crossvalidation(folds = 10)
    start_time = Time.now
    nr_instances = @independent_variables.size
    indices = (0...nr_instances).to_a.shuffle
    # The first `remainder` folds receive one extra instance, so the fold
    # sizes differ by at most one and add up to nr_instances.
    fold_size, remainder = nr_instances.divmod(folds)
    start = 0
    folds.times do |fold|
      fold_start = Time.now
      print "Fold #{fold}: "
      # split train data
      size = fold < remainder ? fold_size + 1 : fold_size
      test_idxs = indices[start, size] || []
      start += size
      # write training/test data
      cv_dir = File.join(@dir, "crossvalidation", fold.to_s)
      dirs = {}
      { :test => test_idxs, :train => indices - test_idxs }.each do |part, part_idxs|
        dirs[part] = write_fold_data(File.join(cv_dir, part.to_s), part_idxs, with_metadata: part == :train)
      end
      # predict
      train_model = self.class.new dirs[:train]
      train_model.predict_fold File.join(dirs[:test], "independent_variables")
      puts Time.now - fold_start
    end
    puts "Total: #{Time.now - start_time}"
  end

  private

  # Write the instances selected by +idxs+ to +dir+ (created if missing).
  #
  # @param dir [String] target directory
  # @param idxs [Array<Integer>] positions of the instances to write
  # @param with_metadata [Boolean] also write the variable type and
  #   similarity threshold files, which are needed to load +dir+ as a model
  # @return [String] +dir+
  def write_fold_data(dir, idxs, with_metadata:)
    FileUtils.mkdir_p dir
    File.open(File.join(dir, "independent_variables"), "w") do |f|
      idxs.each { |n| f.puts "#{@smiles[n]},#{@independent_variables[n].join(",")}" }
    end
    File.open(File.join(dir, "dependent_variables"), "w") do |f|
      f.puts idxs.collect { |n| @dependent_variables[n] }.join("\n")
    end
    if with_metadata
      File.open(File.join(dir, "dependent_variable_type"), "w") { |f| f.puts @dependent_variable_type }
      File.open(File.join(dir, "independent_variable_type"), "w") { |f| f.puts @independent_variable_type }
      File.open(File.join(dir, "similarity_thresholds"), "w") { |f| f.puts @similarity_thresholds.join("\n") }
    end
    dir
  end
end
# Similarity-weighted nearest-neighbour classifier for binary (0/1)
# dependent variables.
#
# Every prediction is an Array of the form
#   [smiles, experimental, classification, probability_0, probability_1,
#    max_similarity, nr_neighbors]
# where classification and both probabilities are nil when fewer than two
# neighbours were found.
class ClassificationModel < Model
  # Predict every line of +independent_variable_file+ and write the results
  # as comma-separated lines to a file named "classification" in the same
  # directory.
  #
  # @param independent_variable_file [String] file with lines of the form
  #   "<smiles>,<v1>,<v2>,..."
  def predict_fold(independent_variable_file)
    pred_dir = File.dirname independent_variable_file
    predictions = File.readlines(independent_variable_file).collect do |line|
      variables = line.chomp.split(",")
      smiles = variables.shift
      predict(smiles, typed_variables(variables))
    end
    File.open(File.join(pred_dir, "classification"), "w") do |f|
      predictions.each { |prediction| f.puts prediction.join(",") }
    end
  end

  # Predict every line of +independent_variable_file+ and print the results
  # as comma-separated lines to standard output. The lines contain only
  # variables (no leading SMILES column); an empty SMILES string is used.
  #
  # @param independent_variable_file [String] file with lines of the form
  #   "<v1>,<v2>,..."
  def predict_file(independent_variable_file)
    File.readlines(independent_variable_file).each do |line|
      puts predict("", typed_variables(line.chomp.split(","))).join(",")
    end
  end

  # Predict a single compound given as SMILES string.
  #
  # @param smiles [String]
  # @return [Array] prediction, see the class description
  def predict_smiles(smiles)
    c = Compound.from_smiles(smiles)
    predict c.smiles, c.fingerprint
  end

  # Predict a single instance from its independent variables.
  #
  # Neighbours are training instances with a different SMILES whose
  # similarity exceeds the second similarity threshold; when fewer than two
  # such neighbours exist the first threshold is used instead.
  #
  # @param smiles [String] identifier of the query; training instances with
  #   the same SMILES are never used as neighbours
  # @param variables [Array] independent variables of the query
  # @return [Array] prediction, see the class description
  def predict(smiles, variables)
    # Tanimoto for binary and cosine for numeric variables; any other
    # variable type yields no similarities and therefore no prediction.
    similarities =
      case @independent_variable_type
      when "binary" then @independent_variables.collect { |row| Similarity.tanimoto([row, variables]) }
      when "numeric" then @independent_variables.collect { |row| Similarity.cosine([row, variables]) }
      else []
      end
    # Remove identical compounds *before* counting neighbours. Otherwise the
    # query itself could push the count to two and suppress the fallback to
    # the lower threshold, although only one usable neighbour remains.
    candidates = similarities.each_index.reject { |i| @smiles[i] == smiles }
    neighbor_idx = candidates.select { |i| similarities[i] > @similarity_thresholds[1] }
    neighbor_idx = candidates.select { |i| similarities[i] > @similarity_thresholds[0] } if neighbor_idx.size < 2 # lower similarity threshold
    # Measured value of the query, if it is part of the training data.
    known_idx = @smiles.index(smiles)
    experimental = known_idx ? @dependent_variables[known_idx] : nil
    # Note: here the maximum is taken over all training instances, in the
    # regular return value below only over the neighbours.
    return [smiles, experimental, nil, nil, nil, similarities.max, neighbor_idx.size] if neighbor_idx.size < 2
    neighbor_dependent_variables = neighbor_idx.collect { |i| @dependent_variables[i] }
    neighbor_similarities = neighbor_idx.collect { |i| similarities[i] }
    probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_similarities)
    classification = probabilities[1] > probabilities[0] ? 1 : 0 # ties are classified as 0
    [smiles, experimental, classification] + probabilities + [neighbor_similarities.max, neighbor_idx.size]
  end

  # Weighted majority vote
  #
  # Each class receives the share of the total weight contributed by its
  # members, scaled by the largest weight.
  #
  # @param [Array<0,1>] dependent_variables
  # @param [Array<Float>] weights
  # @return [Array] probabilities for class 0 and class 1; both are 0.0 when
  #   the weights sum to zero (instead of the NaN a division would give)
  def weighted_majority_vote(dependent_variables, weights)
    weights_sum = weights.sum.to_f
    return [0.0, 0.0] if weights_sum.zero?
    weights_max = weights.max.to_f
    [0, 1].collect do |klass|
      class_weights = weights.each_index.select { |i| dependent_variables[i] == klass }.collect { |i| weights[i] }
      weights_max * class_weights.sum / weights_sum
    end
  end

  private

  # Convert the string fields of an input line according to the independent
  # variable type: floats for "numeric", unchanged strings otherwise.
  #
  # @param items [Array<String>]
  # @return [Array]
  def typed_variables(items)
    @independent_variable_type == "numeric" ? items.collect { |v| v.to_f } : items
  end
end
|