1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
require "fileutils" # FileUtils.mkdir_p is used by #crossvalidation

# Base class for models whose data is stored as plain-text files in a directory.
#
# Files read from the model directory by #initialize:
# - dependent_variables:       one value per line
# - dependent_variable_type:   "binary" (values must be exactly 0 and 1) or "numeric"
# - independent_variables:     one comma-separated row per line; the first field
#                              is stored in @smiles, the rest in @independent_variables
# - independent_variable_type: "numeric" rows are converted to floats, any other
#                              type keeps the fields as strings
# - similarity_thresholds:     one float per line
#
# #crossvalidation calls +predict_file+ on a new instance of the receiver's class;
# this class does not define it, so subclasses have to.
class Model
  # Loads all model files from +dir+ into instance variables.
  #
  # Terminates the process (via +abort+) when the dependent variable type is
  # "binary" and the set of distinct values is not exactly "0" and "1".
  #
  # @param dir [String] directory containing the model files
  def initialize dir
    @dir = dir
    @dependent_variables = File.readlines(File.join(@dir, "dependent_variables")).collect { |v| v.chomp }
    @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp
    if @dependent_variable_type == "binary"
      values = @dependent_variables.uniq.sort
      abort "Incorrect dependent variable values '#{values.join(",")}' for #{@dependent_variable_type} values" unless values == ["0", "1"]
      @dependent_variables = @dependent_variables.collect { |v| v.to_i }
    elsif @dependent_variable_type == "numeric"
      # TODO check for floats (to_f silently turns non-numeric text into 0.0)
      @dependent_variables = @dependent_variables.collect { |v| v.to_f }
    end
    @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp
    @independent_variables = []
    @smiles = []
    File.readlines(File.join(@dir, "independent_variables")).each do |line|
      items = line.chomp.split(",")
      @smiles << items.shift # first column identifies the compound
      items.collect! { |v| v.to_f } if @independent_variable_type == "numeric"
      @independent_variables << items
    end
    @similarity_thresholds = File.readlines(File.join(@dir, "similarity_thresholds")).collect { |v| v.chomp.to_f }
  end

  # Runs a crossvalidation over randomly shuffled instances.
  #
  # For every fold +i+ the data is written to
  # <dir>/crossvalidation/<i>/{train,test}, a new model of the receiver's class
  # is created from the train directory and its +predict_file+ is called with
  # the test directory's "independent_variables" file. Progress and timings are
  # printed to standard output.
  #
  # @param folds [Integer] number of folds; the first (instances % folds) folds
  #   receive one extra test instance
  # @raise [ArgumentError] if +folds+ is not greater than zero
  # @return [nil]
  def crossvalidation folds=10
    raise ArgumentError, "folds must be greater than zero (got #{folds.inspect})" unless folds > 0
    started = monotonic_now
    nr_instances = @independent_variables.size
    indices = (0..nr_instances-1).to_a.shuffle
    fold_size = nr_instances / folds
    start = 0
    0.upto(folds - 1) do |fold|
      fold_started = monotonic_now
      print "Fold #{fold}: "
      # split train data
      last = start + fold_size
      last -= 1 unless nr_instances % folds > fold
      # the slice is nil once start runs past the end of indices
      test_idxs = indices[start..last] || []
      start = last + 1
      splits = {
        :test => test_idxs,
        :train => indices - test_idxs
      }
      # write training/test data
      cv_dir = File.join(@dir, "crossvalidation", fold.to_s)
      dirs = {}
      splits.each do |split, split_idxs|
        dirs[split] = File.join(cv_dir, split.to_s)
        write_split dirs[split], split_idxs, split == :train
      end
      # predict
      train_model = self.class.new dirs[:train]
      train_model.predict_file File.join(dirs[:test], "independent_variables")
      puts monotonic_now - fold_started
    end
    puts "Total: #{monotonic_now - started}"
  end

  private

  # Writes the instances selected by +idxs+ into +dir+; the type and threshold
  # files needed to load +dir+ as a model are added when +with_metadata+ is true.
  def write_split dir, idxs, with_metadata
    FileUtils.mkdir_p dir
    File.open(File.join(dir, "independent_variables"), "w") do |f|
      idxs.each { |i| f.puts "#{@smiles[i]},#{@independent_variables[i].join(",")}" }
    end
    File.open(File.join(dir, "dependent_variables"), "w") do |f|
      f.puts idxs.collect { |i| @dependent_variables[i] }.join("\n")
    end
    return unless with_metadata
    File.open(File.join(dir, "dependent_variable_type"), "w") { |f| f.puts @dependent_variable_type }
    File.open(File.join(dir, "independent_variable_type"), "w") { |f| f.puts @independent_variable_type }
    File.open(File.join(dir, "similarity_thresholds"), "w") { |f| f.puts @similarity_thresholds.join("\n") }
  end

  # Monotonic timestamp in seconds, used only for measuring elapsed time.
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end
# Similarity-weighted neighbor classifier for dependent variables coded as 0/1.
#
# Uses the instance variables loaded by Model#initialize (@smiles,
# @independent_variables, @independent_variable_type, @dependent_variables,
# @similarity_thresholds) and the Similarity module, which is defined outside
# this file section.
class ClassificationModel < Model
  # Predicts every row of +independent_variable_file+ and writes one
  # comma-separated prediction per line to a file named "classification" in
  # the same directory as the input file.
  #
  # @param independent_variable_file [String] file with one comma-separated
  #   row per line; the first field is passed to #predict as +smiles+
  def predict_file independent_variable_file
    pred_dir = File.dirname independent_variable_file
    predictions = File.readlines(independent_variable_file).collect do |line|
      variables = line.chomp.split(",")
      smiles = variables.shift
      variables = variables.collect { |v| v.to_f } if @independent_variable_type == "numeric"
      predict(smiles, variables)
    end
    File.open(File.join(pred_dir, "classification"), "w+") do |f|
      predictions.each { |prediction| f.puts prediction.join(",") }
    end
  end

  # TODO: with neighbors
  def predict_smiles smiles
  end

  # Classifies one instance by a weighted majority vote of its neighbors.
  #
  # Neighbors are the training rows whose similarity exceeds
  # @similarity_thresholds[1]; when fewer than two rows qualify,
  # @similarity_thresholds[0] is used instead. Rows whose @smiles entry equals
  # +smiles+ are then dropped.
  #
  # @param smiles [String] identifier of the query instance
  # @param variables [Array] independent variables of the query instance
  # @return [Array] [smiles, classification, probability of 0, probability of 1,
  #   maximum similarity, number of neighbors]; classification and both
  #   probabilities are nil when fewer than two neighbors remain
  def predict smiles, variables
    # "binary" rows are compared with Similarity.tanimoto, "numeric" rows with
    # Similarity.cosine; any other type yields no similarities at all.
    measure = case @independent_variable_type
              when "binary" then :tanimoto
              when "numeric" then :cosine
              end
    similarities = measure ? @independent_variables.collect { |row| Similarity.public_send(measure, [row, variables]) } : []

    above = lambda do |threshold|
      similarities.each_index.select { |i| similarities[i] > threshold }
    end
    neighbor_idx = above.call(@similarity_thresholds[1])
    neighbor_idx = above.call(@similarity_thresholds[0]) if neighbor_idx.size < 2 # lower similarity threshold
    # NOTE(review): identical compounds still count toward the "fewer than two"
    # check above, so the lower threshold is not tried when removing them
    # leaves fewer than two neighbors - confirm this is intended.
    neighbor_idx = neighbor_idx.reject { |i| @smiles[i] == smiles } # remove identical compounds
    return [smiles, nil, nil, nil, similarities.max, neighbor_idx.size] if neighbor_idx.size < 2

    neighbor_dependent_variables = neighbor_idx.collect { |i| @dependent_variables[i] }
    neighbor_weights = neighbor_idx.collect { |i| similarities[i] }
    probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_weights)
    classification = probabilities[1] > probabilities[0] ? 1 : 0 # ties go to class 0
    [smiles, classification] + probabilities + [similarities.max, neighbor_idx.size]
  end

  # Weighted majority vote
  #
  # Each probability is (largest weight) * (sum of the weights of that class)
  # / (sum of all weights).
  #
  # @param [Array<0,1>] dependent_variables
  # @param [Array<Float>] weights
  # @return [Array] probabilities for class 0 and class 1; [0.0, 0.0] when the
  #   weights sum to zero (this would otherwise divide by zero and give NaN/Infinity)
  def weighted_majority_vote dependent_variables, weights
    weights_sum = weights.sum.to_f
    return [0.0, 0.0] if weights_sum.zero?
    weights_max = weights.max.to_f
    [0, 1].collect do |klass|
      class_weights = weights.each_index.select { |i| dependent_variables[i] == klass }.collect { |i| weights[i] }
      weights_max * class_weights.sum / weights_sum
    end
  end
end
|