summary | refs | log | tree | commit | diff
path: root/lib/model.rb
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2021-02-12 19:54:07 +0100
committer Christoph Helma <helma@in-silico.ch> 2021-02-12 19:54:07 +0100
commit a29eb3e38414cd252850c9c4fb356f8b2bef6fb4 (patch)
tree a957d9ac455e7345c51f3ab6075698f552c497d1 /lib/model.rb
parent 158e9a7ecbc467c3db77c354f203b1176b0fc3f2 (diff)
model.rb refactored, mp2d models updated
Diffstat (limited to 'lib/model.rb')
-rw-r--r-- lib/model.rb 236
1 files changed, 144 insertions, 92 deletions
diff --git a/lib/model.rb b/lib/model.rb
index 0e011c5..c1dcb4e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -1,26 +1,13 @@
+require 'matrix'
+
class Model
def initialize dir
@dir = dir
- @dependent_variables = File.readlines(File.join(@dir,"dependent_variables")).collect{|v| v.chomp}
- @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp
- if @dependent_variable_type == "binary"
- abort "Incorrect dependent variable values '#{@dependent_variables.uniq.sort.join(",")}' for #{@dependent_variable_type} values" unless @dependent_variables.uniq.sort == ["0","1"]
- @dependent_variables = @dependent_variables.collect{|v| v.to_i}
- elsif @dependent_variable_type == "numeric"
- # TODO check for floats
- @dependent_variables = @dependent_variables.collect{|v| v.to_f}
- end
- @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp
- @independent_variables = []
- @smiles = []
- File.readlines(File.join(@dir,"independent_variables")).each do |line|
- items = line.chomp.split(",")
- @smiles << items.shift
- items.collect!{|v| v.to_f} if @independent_variable_type == "numeric"
- @independent_variables << items
- end
- @similarity_thresholds = File.readlines(File.join(@dir,"similarity_thresholds")).collect{|v| v.chomp.to_f}
+ @similarity_thresholds = File.readlines(File.join(@dir,"similarity-thresholds")).collect{|v| v.chomp.to_f}
+ @smiles = Vector[ *File.readlines(File.join(@dir,"smiles")).collect{|v| v.chomp} ]
+ @dependent_variables = Vector[ *File.readlines(File.join(@dir,"dependent-variables")).collect{|v| v.chomp} ]
+ abort "Unequal number of smiles (#{@smiles.size}) and dependent-variables (#{@dependent_variables.size})." unless @smiles.size == @dependent_variables.size
end
def crossvalidation folds=10
@@ -29,89 +16,124 @@ class Model
indices = (0..nr_instances-1).to_a.shuffle
mid = (nr_instances/folds)
start = 0
+ threads = []
0.upto(folds-1) do |i|
- t = Time.now
- print "Fold #{i}: "
- # split train data
- last = start+mid
- last = last-1 unless nr_instances%folds > i
- test_idxs = indices[start..last] || []
- idxs = {
- :test => test_idxs,
- :train => indices-test_idxs
- }
- start = last+1
- # write training/test data
- cv_dir = File.join(@dir,"crossvalidation",i.to_s)
- dirs = {}
- idxs.each do |t,idx|
- d = File.join cv_dir,t.to_s
- dirs[t] = d
- FileUtils.mkdir_p d
- File.open(File.join(d,"independent_variables"),"w+") do |f|
- idx.each do |i|
- f.print "#{@smiles[i]},"
- f.puts @independent_variables[i].join(",")
- end
- end
- File.open(File.join(d,"dependent_variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")}
- if t == :train
- File.open(File.join(d,"dependent_variable_type"),"w+"){ |f| f.puts @dependent_variable_type }
- File.open(File.join(d,"independent_variable_type"),"w+"){ |f| f.puts @independent_variable_type }
- File.open(File.join(d,"similarity_thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") }
+ threads << Thread.new do
+ t = Time.now
+ puts "Fold #{i} started"
+ # split train data
+ last = start+mid
+ last = last-1 unless nr_instances%folds > i
+ test_idxs = indices[start..last] || []
+ idxs = {
+ :test => test_idxs,
+ :train => indices-test_idxs
+ }
+ start = last+1
+ # write training/test data
+ cv_dir = File.join(@dir,"crossvalidation",i.to_s)
+ dirs = {}
+ idxs.each do |t,idx|
+ d = File.join cv_dir,t.to_s
+ dirs[t] = d
+ FileUtils.mkdir_p d
+ File.open(File.join(d,"independent-variables"),"w+") { |f| f.puts idx.collect{|i| @independent_variables[i].join(",")}.join("\n") }
+ File.open(File.join(d,"smiles"),"w+") { |f| f.puts idx.collect{|i| @smiles[i]}.join("\n") }
+ File.open(File.join(d,"dependent-variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")}
+ File.open(File.join(d,"similarity-thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") } if t == :train
end
+ # predict
+ train_model = self.class.new dirs[:train]
+ train_model.batch_predict dirs[:test], File.join(dirs[:test],"predictions")
+ puts "Fold #{i}: #{(Time.now-t)/60} min"
+ end
+ end
+ threads.each(&:join)
+ puts "Total: #{(Time.now-start_time)/60} min"
+ end
+
+ def batch_predict dir, out=$stdout
+ prediction_smiles = File.readlines(File.join(dir,"smiles")).collect{|smi| smi.chomp}
+ File.open(out, "w+") do |f|
+ File.readlines(File.join(dir,"independent-variables")).each_with_index do |line,i|
+ variables = line.chomp.split(",")
+ f.puts predict(prediction_smiles[i],variables).join(",")
end
- # predict
- train_model = self.class.new dirs[:train]
- train_model.predict_fold File.join(dirs[:test],"independent_variables")
- puts Time.now-t
end
- puts "Total: #{Time.now-start_time}"
end
end
-class ClassificationModel < Model
+module Cosine
- def predict_fold independent_variable_file
- pred_dir = File.dirname independent_variable_file
- predictions = []
- File.readlines(independent_variable_file).each do |line|
- variables = line.chomp.split(",")
- smiles = variables.shift
- variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric"
- predictions << predict(smiles,variables)
+ def preprocess
+ puts "Feature selection"
+ t = Time.now
+ @selected = (0..@independent_variables.first.size-1).to_a
+ columns = Matrix[ *@independent_variables ].column_vectors
+ columns.each_with_index do |c,i|
+ next unless @selected.include? i
+ p "#{i}/#{@selected.size}"
+ # remove variables with zero variances
+ if c.to_a.zero_variance?
+ @selected.delete i
+ next
+ end
+ # remove correlated variables
+ (i+1..columns.size-1).each do |j|
+ next unless @selected.include? j
+ @selected.delete(j) if c.to_a.r(columns[j].to_a).abs > 0.9
+ end
end
- File.open(File.join(pred_dir,"classification"),"w+") { |f| predictions.each {|p| f.puts p.join(",")} }
+ @selected.sort!
+ p
+ mat = @selected.collect{|i| @independent_variables[i]}
+ columns = Matrix[ *mat ].column_vectors
+ @independent_variable_means = columns.collect{|c| c.to_a.mean}
+ @independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation}
+ scaled_columns = []
+ columns.each_with_index{|col,i| scaled_columns << col.collect{|v| v ? (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i] : nil}}
+ @scaled_independent_variables = Matrix.columns(scaled_columns).to_a
+ p @scaled_independent_variables.size, @selected_variable_means.size, @selected_variable_standard_deviations.size
+ puts (Time.now-t)/60
end
- def predict_file independent_variable_file
- predictions = []
- File.readlines(independent_variable_file).each do |line|
- variables = line.chomp.split(",")
- variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric"
- puts predict("",variables).join(",")
- end
+ def predict smiles, variables
+ variables.collect!{|v| v.to_f}
+ preprocess unless @scaled_independent_variables # lazy preprocessing
+ selected_variables = @selected.collect{|i| variables[i]}
+ scaled_variables = selected_variables.each_with_index{|v,i| (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i]}
+ similarities = @scaled_independent_variables.collect{|row| Similarity.cosine([row,scaled_variables])}
+ similarity_prediction smiles, similarities
end
+end
+
+module Tanimoto
+
def predict_smiles smiles
c = Compound.from_smiles(smiles)
- predict c.smiles, c.fingerprint
+ predict smiles, c.fingerprint
end
-
- def predict smiles, variables
- similarities = []
- @independent_variables.each do |row|
- if @independent_variable_type == "binary"
- similarities << Similarity.tanimoto([row, variables])
- elsif @independent_variable_type == "numeric"
- similarities << Similarity.cosine([row, variables])
- end
- end
- neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[1]}
- neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold
+ def predict smiles, fingerprint
+ similarities = @independent_variables.collect{|row| Similarity.tanimoto([row,fingerprint])}
+ similarity_prediction smiles, similarities
+ end
+end
+
+class ClassificationModel < Model
+
+ def initialize dir
+ super dir
+ abort "Incorrect binary dependent variable values (#{@dependent_variables.uniq.sort.join(",")}). Expecting 0 and 1." unless @dependent_variables.uniq.sort == ["0","1"]
+ @dependent_variables = @dependent_variables.collect{|v| v.to_i}
+ end
+
+ def similarity_prediction smiles, similarities
+ neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[1]}
+ neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold
neighbor_idx.select!{|i| @smiles[i] != smiles} # remove identical compounds
- experimental = @dependent_variables[@smiles.index(smiles)] if @smiles.include? smiles
+ experimental = @dependent_variables[@smiles.to_a.index(smiles)] if @smiles.include? smiles
return [smiles,experimental,nil,nil,nil,similarities.max,neighbor_idx.size] if neighbor_idx.size < 2
neighbor_dependent_variables = neighbor_idx.collect{|i| @dependent_variables[i]}
@@ -119,20 +141,17 @@ class ClassificationModel < Model
probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_similarities)
probabilities[1] > probabilities[0] ? classification = 1 : classification = 0
- #p neighbor_dependent_variables.join ","
- #p neighbor_similarities.join ","
- #p neighbor_idx.collect{|i| @smiles[i]}
[ smiles, experimental, classification ] + probabilities + [ neighbor_similarities.max, neighbor_idx.size ]
end
-
+
# Weighted majority vote
- # @param [Array<0,1>] dependent_variables
+ # @param [Array<0,1>] neighbor_dependent_variables
# @param [Array<Float>] weights
# @return [Array] probabilities
- def weighted_majority_vote dependent_variables, weights
+ def weighted_majority_vote neighbor_dependent_variables, weights
w = []
- w[0] = weights.each_index.select{|i| dependent_variables[i] == 0}.collect{|i| weights[i]}
- w[1] = weights.each_index.select{|i| dependent_variables[i] == 1}.collect{|i| weights[i]}
+ w[0] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 0}.collect{|i| weights[i]}
+ w[1] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 1}.collect{|i| weights[i]}
weights_sum = weights.sum.to_f
weights_max = weights.max.to_f
probabilities = []
@@ -141,3 +160,36 @@ class ClassificationModel < Model
probabilities
end
end
+
+# Subclass of Model without any methods of its own in this revision.
+# NOTE(review): presumably a placeholder for regression support - confirm.
+class RegressionModel < Model; end
+
+class TanimotoClassificationModel < ClassificationModel
+ include Tanimoto
+
+ def initialize dir
+ super dir
+ @independent_variables = Vector[ *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",") } ]
+ abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.size})." unless @dependent_variables.size == @independent_variables.size
+ end
+end
+
+class CosineClassificationModel < ClassificationModel
+ include Cosine
+
+ def initialize dir
+ super dir
+ @independent_variables = Matrix[
+ *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",").collect{|v| v.to_f} }
+ ]
+ abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.row_vectors.size})." unless @dependent_variables.size == @independent_variables.row_vectors.size
+ abort "Unequal number of independent-variable-names (#{@independent_variable_names.size}) and independent-variables columns (#{@independent_variables.column_vectors.size})." unless @independent_variable_names.size == @independent_variables.row_vectors.size
+ end
+
+end
+
+# Subclass of RegressionModel without any methods of its own in this revision.
+class TanimotoRegressionModel < RegressionModel; end
+
+# Subclass of RegressionModel without any methods of its own in this revision.
+class CosineRegressionModel < RegressionModel; end