# Reconstructed post-commit state of lib/model.rb.
# Commit 08e5768e9a446db8ab95152d2e9403a0e635ec63 (Christoph Helma,
# 2021-03-08, "cdk predictions fixed").

class String
  # @return [Boolean] true when Kernel#Float accepts the whole string
  def numeric?
    Float(self)
    true
  rescue ArgumentError, TypeError
    false
  end
end

# Nearest-neighbour classification model backed by a CSV training file.
#
# Training file layout (comma separated, first line is a header):
#   column 0: compound identifier (SMILES), column 1: dependent variable,
#   columns 2..: independent variables.
class Model

  # Column names of the prediction CSV. "Simimilarity" is misspelt in the
  # original output format and kept as-is so that existing readers of the
  # prediction files keep working.
  PREDICTION_HEADER = ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].freeze

  attr_reader :train, :dependent_variable_name, :independent_variable_names

  # Reads the training data. All cells are kept as strings until #model_type
  # (or #predict) converts them.
  #
  # @param file [String] path of the training CSV
  def initialize(file)
    puts "Reading training data from #{file}."
    @train = File.readlines(file).map { |l| l.chomp.split(",") }
    header = @train.shift
    @dependent_variable_name = header[1]
    @independent_variable_names = header[2..-1]
  end

  # Determines the type of the dependent variable ("binary" or "numeric") and
  # of the independent variables ("set" or "numeric") and converts the
  # training rows in place: binary classes become true/false, numeric values
  # become Floats.
  #
  # @raise [RuntimeError] if a column group is neither of the supported types
  def model_type
    # The rows are converted in place, so a second run would see converted
    # values and misclassify them.
    return if @dependent_variable_type && @independent_variable_type

    puts "Determining model type."
    classes = dependent_variables.uniq
    # Order-independent comparison: the first training row may belong to
    # either class.
    if classes.map(&:to_s).sort == %w[0 1]
      @dependent_variable_type = "binary"
      @train.each { |t| t[1] = (t[1] == "1") }
    elsif dependent_variables.map { |v| v.to_s.numeric? }.uniq == [true]
      @dependent_variable_type = "numeric"
      @train.each { |t| t[1] = t[1].to_f }
    else
      raise "Incorrect model dependent variables [#{classes}]. Must be either [0,1] or numeric."
    end

    numeric_flags = independent_variables.flatten.map { |v| v.to_s.numeric? }.uniq
    if numeric_flags == [false]
      @independent_variable_type = "set"
    elsif numeric_flags == [true]
      @independent_variable_type = "numeric"
      @train.each { |t| t[2..-1] = t[2..-1].map(&:to_f) }
    else
      raise "Incorrect model independent variables [#{independent_variables.flatten.uniq}]. Must be either a set (fingerprints) or numeric."
    end
  end

  # Predicts every compound listed as "predict" in the file
  # "scaled-variables.csv" located next to +file+ and writes the results to
  # "<file without .csv>-prediction.csv". Each result line is also echoed to
  # stdout.
  #
  # @param file [String] path of the batch CSV; only its directory and name
  #   are used while the preprocessing step below is disabled
  def predict(file)
    # The following preprocessing step was disabled (=begin/=end) in the
    # original commit and is kept here for reference. While it is disabled,
    # scaled-variables.csv has to exist already.
    #
    # model_type
    # puts "Reading prediction data from #{file}."
    # @batch = File.readlines(file).collect{|l| l.chomp.split(",")}
    # header = @batch.shift
    # @batch_independent_variable_names = header[1..-1]
    # unless (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [false] and @independent_variable_type == "set") or (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true] and @independent_variable_type == "numeric")
    #   raise "Incorrect batch independent variables [#{independent_variables.flatten.uniq}]. Must be #{@independent_variable_type}."
    # end
    # if @independent_variable_type == "numeric"
    #   @batch.each {|t| t[1..-1] = t[1..-1].collect{|v| v = v.to_f}}
    #   select(@independent_variable_names & @batch_independent_variable_names)
    #   File.open(File.join(File.dirname(file),"common-variables.csv"),"w+") do |f|
    #     f.print "CANSMI,dataset,"
    #     f.puts @independent_variable_names.join(",")
    #     @train.each do |row|
    #       f.puts ([row[0],"train"]+row[2..-1]).join(",")
    #     end
    #     @batch.each do |row|
    #       f.puts ([row[0],"predict"]+row[1..-1]).join(",")
    #     end
    #   end
    #   puts "Feature selection and scaling."
    #   puts `Rscript #{File.join(File.dirname(__FILE__),"..","bin","preprocessing.R")} #{File.join(File.dirname(file),"common-variables.csv")} #{File.join(File.dirname(file),"scaled-variables.csv")}`
    # end

    puts "Reading scaled features."
    read_scaled_variables(File.join(File.dirname(file), "scaled-variables.csv"))

    puts "Predicting #{file}."
    File.open(prediction_path(file), "w+") do |out|
      out.puts PREDICTION_HEADER.join(",")
      @scaled_batch.each do |compound|
        neighbors = @scaled_train.map do |row|
          row[0..1] + [similarity(row[2..-1], compound[1..-1])]
        end
        classification(compound[0], neighbors).each do |result|
          line = result.join(",")
          out.puts line
          puts line
        end
      end
    end
  end

  # Restricts training and batch data to the given independent variables
  # (in the given order) and updates both variable name lists.
  #
  # @param variable_names [Array<String>] names present in both data sets
  # @raise [ArgumentError] if a name is missing from either data set
  def select(variable_names)
    # Resolve all columns first so that nothing is modified when a name is
    # unknown.
    model_variable_idx = [0, 1] + variable_names.map { |n| column_index(@independent_variable_names, n, 2) }
    batch_variable_idx = [0] + variable_names.map { |n| column_index(@batch_independent_variable_names, n, 1) }
    @train.map! { |r| model_variable_idx.map { |i| r[i] } }
    @batch.map! { |r| batch_variable_idx.map { |i| r[i] } }
    @independent_variable_names = variable_names
    @batch_independent_variable_names = variable_names
  end

  # Classifies one compound from its neighbours.
  #
  # @param smiles [String] identifier of the compound to predict
  # @param train [Array<Array>] rows of [identifier, activity, weight]
  # @return [Array<Array>] one row per training entry with the same
  #   identifier (a single row if there is none):
  #   [identifier, experimental activity, classification (0, 1 or nil),
  #   p-inactive, p-active, largest neighbour weight, number of neighbours]
  def classification(smiles, train)
    # Training entries of the compound itself are reported as experimental
    # values and must not vote.
    experimental, neighbors = train.partition { |row| row[0] == smiles }
    debug_neighbors(smiles, neighbors)

    if neighbors.size < 2
      predicted = nil
      probabilities = [nil, nil]
    else
      probabilities = weighted_majority_vote(neighbors)
      predicted = probabilities[1] > probabilities[0] ? 1 : 0
    end

    summary = [neighbors.map { |n| n[2] }.max, neighbors.size]
    experimental = [[nil, nil, nil]] if experimental.empty?
    # Report the measured value of each matching training row.
    experimental.map { |row| [smiles, row[1], predicted] + probabilities + summary }
  end

  # Weighted majority vote over neighbour rows.
  #
  # @param neighbors [Array<Array>] rows of [identifier, activity, weight];
  #   activity 0/false counts as inactive, 1/true as active, anything else
  #   contributes to the normalisation only
  # @return [Array<Float>] [p-inactive, p-active], each scaled by the largest
  #   weight; [0.0, 0.0] when the weights sum to zero
  def weighted_majority_vote(neighbors)
    weights = neighbors.map { |n| n[2].to_f }
    weights_sum = weights.sum
    return [0.0, 0.0] if weights_sum.zero? # avoids NaN from 0/0

    weights_max = weights.max
    class_sums = [0.0, 0.0]
    neighbors.each_with_index do |n, i|
      if n[1] == 0 || n[1] == false
        class_sums[0] += weights[i]
      elsif n[1] == 1 || n[1] == true
        class_sums[1] += weights[i]
      end
    end
    class_sums.map { |s| weights_max * s / weights_sum }
  end

  # @return [Array] first column of the training data
  def smiles
    @train.map { |t| t[0] }
  end

  # @return [Array] second column of the training data
  def dependent_variables
    @train.map { |t| t[1] }
  end

  # @return [Array<Array>] columns 2.. of the training data
  def independent_variables
    @train.map { |t| t[2..-1] }
  end

  # @return [Array] first column of the batch data
  def batch_smiles
    @batch.map { |t| t[0] }
  end

  # @return [Array<Array>] columns 1.. of the batch data
  def batch_independent_variables
    @batch.map { |t| t[1..-1] }
  end

  private

  # Output path for the predictions of +file+. Anchored to the extension so
  # that a path without ".csv" never resolves to the input file itself.
  def prediction_path(file)
    return file.sub(/\.csv\z/, "-prediction.csv") if file.end_with?(".csv")

    "#{file}-prediction.csv"
  end

  # Loads scaled-variables.csv into @scaled_train ([id, activity, values...])
  # and @scaled_batch ([id, values...]) and replaces both variable name lists
  # with the header of that file.
  def read_scaled_variables(path)
    lines = File.readlines(path)
    raise "No data in #{path}." if lines.empty?

    @independent_variable_names = @batch_independent_variable_names = lines.shift.chomp.split(",")[2..-1]
    @scaled_train = []
    @scaled_batch = []
    lines.each do |line|
      items = line.chomp.split(",")
      case items[1]
      when "train"
        # The n-th "train" row is matched with the n-th training row.
        # NOTE(review): assumes the scaled file keeps the training order.
        source = @train[@scaled_train.size]
        raise "More train rows in #{path} than in the training data." unless source

        items[1] = source[1]
        @scaled_train << numeric_row(items)
      when "predict"
        items.delete_at(1)
        @scaled_batch << numeric_row(items)
      end
    end
  end

  # Converts every numeric-looking cell except the identifier to Float.
  def numeric_row(items)
    [items.first] + items.drop(1).map { |v| v.to_s.numeric? ? v.to_f : v }
  end

  # Similarity in (0, 1] derived from the Euclidean distance, so that close
  # compounds receive large weights in the majority vote.
  # NOTE(review): assumes Distance.euclid returns a non-negative distance.
  # Alternative used earlier: Similarity.cosine([a, b])
  def similarity(a, b)
    1.0 / (1.0 + Distance.euclid([a, b]))
  end

  # Position of +name+ in +names+ shifted by the leading non-variable columns.
  def column_index(names, name, offset)
    idx = names.index(name)
    raise ArgumentError, "Unknown independent variable '#{name}'." unless idx

    idx + offset
  end

  # Prints the most similar neighbours when Ruby runs in debug mode (-d).
  def debug_neighbors(smiles, neighbors)
    return unless $DEBUG

    puts "=="
    puts smiles
    puts neighbors.sort_by { |r| -r[2].to_f }.first(11).map { |r| r.join(",") }.join("\n")
    puts "--"
  end
end