path: root/lib/model.rb
author     Christoph Helma <helma@in-silico.ch>  2021-03-08 17:41:26 +0100
committer  Christoph Helma <helma@in-silico.ch>  2021-03-08 17:41:26 +0100
commit     08e5768e9a446db8ab95152d2e9403a0e635ec63 (patch)
tree       6f4486c6bfd84b69febcb9d3a4d9de8fee1b1a26 /lib/model.rb
parent     a29eb3e38414cd252850c9c4fb356f8b2bef6fb4 (diff)
cdk predictions fixed
Diffstat (limited to 'lib/model.rb')
-rw-r--r--  lib/model.rb  290
1 file changed, 126 insertions(+), 164 deletions(-)
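
This commit replaces the directory-based Model constructor with one that reads a single CSV file and moves batch prediction into Model#predict. A minimal usage sketch in Ruby, assuming the interface introduced in the diff below; the file names training.csv and batch.csv are hypothetical, and note that the first part of predict is disabled with =begin/=end in this revision:

    require_relative "model"

    # training.csv: header = SMILES column, dependent variable name, independent
    # variable names, followed by one row per training compound (hypothetical file name)
    model = Model.new("training.csv")
    model.model_type                        # detects binary vs. numeric variables
    puts model.dependent_variable_name
    puts model.independent_variable_names.size

    # batch.csv: header = SMILES column plus the same independent variables;
    # predictions are written next to the input as batch-prediction.csv
    model.predict("batch.csv")
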
diff --git a/lib/model.rb b/lib/model.rb
index c1dcb4e..c4ca1f3 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -1,195 +1,157 @@
-require 'matrix'
+class String
+ def numeric?
+ Float(self) != nil rescue false
+ end
+end
class Model
- def initialize dir
- @dir = dir
- @similarity_thresholds = File.readlines(File.join(@dir,"similarity-thresholds")).collect{|v| v.chomp.to_f}
- @smiles = Vector[ *File.readlines(File.join(@dir,"smiles")).collect{|v| v.chomp} ]
- @dependent_variables = Vector[ *File.readlines(File.join(@dir,"dependent-variables")).collect{|v| v.chomp} ]
- abort "Unequal number of smiles (#{@smiles.size}) and dependent-variables (#{@dependent_variables.size})." unless @smiles.size == @dependent_variables.size
- end
+ attr_reader :train, :dependent_variable_name, :independent_variable_names
- def crossvalidation folds=10
- start_time = Time.now
- nr_instances = @independent_variables.size
- indices = (0..nr_instances-1).to_a.shuffle
- mid = (nr_instances/folds)
- start = 0
- threads = []
- 0.upto(folds-1) do |i|
- threads << Thread.new do
- t = Time.now
- puts "Fold #{i} started"
- # split train data
- last = start+mid
- last = last-1 unless nr_instances%folds > i
- test_idxs = indices[start..last] || []
- idxs = {
- :test => test_idxs,
- :train => indices-test_idxs
- }
- start = last+1
- # write training/test data
- cv_dir = File.join(@dir,"crossvalidation",i.to_s)
- dirs = {}
- idxs.each do |t,idx|
- d = File.join cv_dir,t.to_s
- dirs[t] = d
- FileUtils.mkdir_p d
- File.open(File.join(d,"independent-variables"),"w+") { |f| f.puts idx.collect{|i| @independent_variables[i].join(",")}.join("\n") }
- File.open(File.join(d,"smiles"),"w+") { |f| f.puts idx.collect{|i| @smiles[i]}.join("\n") }
- File.open(File.join(d,"dependent-variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")}
- File.open(File.join(d,"similarity-thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") } if t == :train
- end
- # predict
- train_model = self.class.new dirs[:train]
- train_model.batch_predict dirs[:test], File.join(dirs[:test],"predictions")
- puts "Fold #{i}: #{(Time.now-t)/60} min"
- end
- end
- threads.each(&:join)
- puts "Total: #{(Time.now-start_time)/60} min"
+ def initialize file
+ puts "Reading training data from #{file}."
+ @train = File.readlines(file).collect{|l| l.chomp.split(",")}
+ header = @train.shift
+ @dependent_variable_name = header[1]
+ @independent_variable_names = header[2..-1]
end
- def batch_predict dir, out=$stdout
- prediction_smiles = File.readlines(File.join(dir,"smiles")).collect{|smi| smi.chomp}
- File.open(out, "w+") do |f|
- File.readlines(File.join(dir,"independent-variables")).each_with_index do |line,i|
- variables = line.chomp.split(",")
- f.puts predict(prediction_smiles[i],variables).join(",")
- end
+ def model_type
+ puts "Determining model type."
+ if dependent_variables.uniq == ["1","0"]
+ @dependent_variable_type = "binary"
+ @train.each {|t| t[1] == "1" ? t[1] = true : t[1] = false }
+ elsif dependent_variables.collect{|v| v.numeric?}.uniq == [true]
+ @dependent_variable_type = "numeric"
+ @train.each {|t| t[1] = t[1].to_f }
+ else
+ raise "Incorrect model dependent variables [#{dependent_variables.uniq}]. Must be either [0,1] or numeric."
+ end
+ if independent_variables.flatten.collect{|v| v.numeric?}.uniq == [false]
+ @independent_variable_type = "set"
+ elsif independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true]
+ @independent_variable_type = "numeric"
+ @train.each {|t| t[2..-1] = t[2..-1].collect{|v| v = v.to_f}}
+ else
+ raise "Incorrect model independent variables [#{independent_variables.flatten.uniq}]. Must be either a set (fingerprints) or numeric."
end
end
-end
-module Cosine
-
- def preprocess
- puts "Feature selection"
- t = Time.now
- @selected = (0..@independent_variables.first.size-1).to_a
- columns = Matrix[ *@independent_variables ].column_vectors
- columns.each_with_index do |c,i|
- next unless @selected.include? i
- p "#{i}/#{@selected.size}"
- # remove variables with zero variances
- if c.to_a.zero_variance?
- @selected.delete i
- next
+ def predict file
+=begin
+ model_type
+ puts "Reading prediction data from #{file}."
+ @batch = File.readlines(file).collect{|l| l.chomp.split(",")}
+ header = @batch.shift
+ @batch_independent_variable_names = header[1..-1]
+ unless (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [false] and @independent_variable_type == "set") or (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true] and @independent_variable_type == "numeric")
+ raise "Incorrect batch independent variables [#{independent_variables.flatten.uniq}]. Must be #{@independent_variable_type}."
+ end
+ if @independent_variable_type == "numeric"
+ @batch.each {|t| t[1..-1] = t[1..-1].collect{|v| v = v.to_f}}
+ select(@independent_variable_names & @batch_independent_variable_names)
+ File.open(File.join(File.dirname(file),"common-variables.csv"),"w+") do |f|
+ f.print "CANSMI,dataset,"
+ f.puts @independent_variable_names.join(",")
+ @train.each do |row|
+ f.puts ([row[0],"train"]+row[2..-1]).join(",")
+ end
+ @batch.each do |row|
+ f.puts ([row[0],"predict"]+row[1..-1]).join(",")
+ end
end
- # remove correlated variables
- (i+1..columns.size-1).each do |j|
- next unless @selected.include? j
- @selected.delete(j) if c.to_a.r(columns[j].to_a).abs > 0.9
+ puts "Feature selection and scaling."
+ puts `Rscript #{File.join(File.dirname(__FILE__),"..","bin","preprocessing.R")} #{File.join(File.dirname(file),"common-variables.csv")} #{File.join(File.dirname(file),"scaled-variables.csv")}`
+=end
+ puts "Reading scaled features."
+ lines = File.readlines(File.join(File.dirname(file),"scaled-variables.csv"))
+ @independent_variable_names = @batch_independent_variable_names = lines.shift.chomp.split(",")[2..-1]
+ @scaled_train = []
+ @scaled_batch = []
+ lines.each_with_index do |line,i|
+ items = line.chomp.split(",")
+ if items[1] == "train"
+ items[1] = @train[i][1]
+ @scaled_train << items.collect{|i| i.to_s.numeric? ? i.to_f : i}
+ elsif items[1] == "predict"
+ items.delete_at(1)
+ @scaled_batch << items.collect{|i| i.to_s.numeric? ? i.to_f : i}
+ end
end
- end
- @selected.sort!
- p
- mat = @selected.collect{|i| @independent_variables[i]}
- columns = Matrix[ *mat ].column_vectors
- @independent_variable_means = columns.collect{|c| c.to_a.mean}
- @independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation}
- scaled_columns = []
- columns.each_with_index{|col,i| scaled_columns << col.collect{|v| v ? (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i] : nil}}
- @scaled_independent_variables = Matrix.columns(scaled_columns).to_a
- p @scaled_independent_variables.size, @selected_variable_means.size, @selected_variable_standard_deviations.size
- puts (Time.now-t)/60
+ puts "Predicting #{file}."
+ File.open(file.sub(".csv","-prediction.csv"),"w+") do |f|
+ f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].join(",")
+ @scaled_batch.each do |pred|
+ classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Distance.euclid([row[2..-1],pred[1..-1]])]}).each do |pred|
+ #classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred|
+ f.puts pred.join(",")
+ puts pred.join(",")
+ end
+ end
+ end
+ #end
end
- def predict smiles, variables
- variables.collect!{|v| v.to_f}
- preprocess unless @scaled_independent_variables # lazy preprocessing
- selected_variables = @selected.collect{|i| variables[i]}
- scaled_variables = selected_variables.each_with_index{|v,i| (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i]}
- similarities = @scaled_independent_variables.collect{|row| Similarity.cosine([row,scaled_variables])}
- similarity_prediction smiles, similarities
+ def select variable_names
+ model_variable_idx = [0,1]+variable_names.collect{|n| @independent_variable_names.index(n)+2}
+ batch_variable_idx = [0]+variable_names.collect{|n| @batch_independent_variable_names.index(n)+1}
+ @train.collect!{|r| model_variable_idx.collect{|i| r[i]}}
+ @batch.collect!{|r| batch_variable_idx.collect{|i| r[i]}}
+ @independent_variable_names = variable_names
+ @batch_independent_variable_names = variable_names
end
-end
-
-module Tanimoto
-
- def predict_smiles smiles
- c = Compound.from_smiles(smiles)
- predict smiles, c.fingerprint
+ def classification smiles, train
+ experimental = train.select{|row| row[0] == smiles}
+ train = train-experimental
+ #train.select!{|row| row[2] > 0.8}
+ #train.select!{|row| row[2] > 0.5} if train.size < 2
+ #train.select!{|row| row[2] > 0.5}
+ #train.select!{|row| row[2] > -1.0} if train.size < 2
+ puts "=="
+ puts smiles
+ puts train.sort_by{|r| r[2]}[0..10].collect{|r| r.join(",")}.join("\n")
+ #puts train.sort_by{|r| r[2]}.reverse.collect{|r| r.join(",")}.join("\n")
+ puts "--"
+ if train.size < 2
+ classification = nil
+ probabilities = [nil,nil]
+ else
+ probabilities = weighted_majority_vote(train)
+ probabilities[1] > probabilities[0] ? classification = 1 : classification = 0
+ end
+ experimental = [[nil,nil,nil]] if experimental.empty?
+ experimental.collect do
+ [ smiles, experimental[1], classification ] + probabilities + [ train.collect{|t| t[2]}.max, train.size ]
+ end
end
- def predict smiles, fingerprint
- similarities = @independent_variables.collect{|row| Similarity.tanimoto([row,fingerprint])}
- similarity_prediction smiles, similarities
+ def weighted_majority_vote neighbors
+ w = [neighbors.select{|n| n[1] == 0}.collect{|n| n[2]}, neighbors.select{|n| n[1] == 1}.collect{|n| n[2]}]
+ weights_sum = neighbors.collect{|n| n[2]}.sum.to_f
+ weights_max = neighbors.collect{|n| n[2]}.max.to_f
+ [weights_max*w[0].sum/weights_sum, weights_max*w[1].sum/weights_sum]
end
-end
-class ClassificationModel < Model
- def initialize dir
- super dir
- abort "Incorrect binary dependent variable values (#{@dependent_variables.uniq.sort.join(",")}). Expecting 0 and 1." unless @dependent_variables.uniq.sort == ["0","1"]
- @dependent_variables = @dependent_variables.collect{|v| v.to_i}
+ def smiles
+ @train.collect{|t| t[0]}
end
-
- def similarity_prediction smiles, similarities
- neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[1]}
- neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold
- neighbor_idx.select!{|i| @smiles[i] != smiles} # remove identical compounds
- experimental = @dependent_variables[@smiles.to_a.index(smiles)] if @smiles.include? smiles
- return [smiles,experimental,nil,nil,nil,similarities.max,neighbor_idx.size] if neighbor_idx.size < 2
- neighbor_dependent_variables = neighbor_idx.collect{|i| @dependent_variables[i]}
- neighbor_similarities = neighbor_idx.collect{|i| similarities[i]}
- probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_similarities)
- probabilities[1] > probabilities[0] ? classification = 1 : classification = 0
-
- [ smiles, experimental, classification ] + probabilities + [ neighbor_similarities.max, neighbor_idx.size ]
+ def dependent_variables
+ @train.collect{|t| t[1]}
end
- # Weighted majority vote
- # @param [Array<0,1>] neighbor_dependent_variables
- # @param [Array<Float>] weights
- # @return [Array] probabilities
- def weighted_majority_vote neighbor_dependent_variables, weights
- w = []
- w[0] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 0}.collect{|i| weights[i]}
- w[1] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 1}.collect{|i| weights[i]}
- weights_sum = weights.sum.to_f
- weights_max = weights.max.to_f
- probabilities = []
- probabilities[0] = weights_max*w[0].sum/weights_sum
- probabilities[1] = weights_max*w[1].sum/weights_sum
- probabilities
+ def independent_variables
+ @train.collect{|t| t[2..-1]}
end
-end
-
-class RegressionModel < Model
-end
-
-class TanimotoClassificationModel < ClassificationModel
- include Tanimoto
- def initialize dir
- super dir
- @independent_variables = Vector[ *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",") } ]
- abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.size})." unless @dependent_variables.size == @independent_variables.size
+ def batch_smiles
+ @batch.collect{|t| t[0]}
end
-end
-
-class CosineClassificationModel < ClassificationModel
- include Cosine
- def initialize dir
- super dir
- @independent_variables = Matrix[
- *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",").collect{|v| v.to_f} }
- ]
- abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.row_vectors.size})." unless @dependent_variables.size == @independent_variables.row_vectors.size
- abort "Unequal number of independent-variable-names (#{@independent_variable_names.size}) and independent-variables columns (#{@independent_variables.column_vectors.size})." unless @independent_variable_names.size == @independent_variables.row_vectors.size
+ def batch_independent_variables
+ @batch.collect{|t| t[1..-1]}
end
end
-
-class TanimotoRegressionModel < RegressionModel
-end
-
-class CosineRegressionModel < RegressionModel
-end
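
For reference, a short worked example of the new weighted_majority_vote, which expects neighbor triples of [smiles, activity (0/1), weight] as assembled in classification. The SMILES strings and weights below are made up, and model is assumed to be the instance from the sketch above:

    neighbors = [
      ["CCO",      0, 0.6],  # inactive neighbor, weight 0.6
      ["CCN",      1, 0.9],  # active neighbor,   weight 0.9
      ["c1ccccc1", 1, 0.8]   # active neighbor,   weight 0.8
    ]
    # weights_sum = 2.3, weights_max = 0.9
    # p(inactive) = 0.9 * 0.6 / 2.3 ≈ 0.23
    # p(active)   = 0.9 * 1.7 / 2.3 ≈ 0.67
    model.weighted_majority_vote(neighbors)  # => [0.2347..., 0.6652...]

Note that the two values sum to weights_max rather than to 1; classification only compares them against each other when assigning the predicted class.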