# Core extension used by Model to tell numeric CSV cells from symbolic ones.
class String
  # @return [Boolean] true when Kernel#Float accepts the whole string
  def numeric?
    Float(self)
    true
  rescue ArgumentError, TypeError
    false
  end
end

# Similarity-weighted nearest-neighbour classifier driven by CSV files.
#
# Training rows are arrays laid out as [smiles, dependent, *independents];
# batch (prediction) rows are [smiles, *independents]. Both files are read
# with a plain comma split, so quoted fields are not supported.
class Model
  attr_reader :train, :dependent_variable_name, :independent_variable_names

  # Reads the training CSV. The first line is the header: column 0 is the
  # compound identifier, column 1 the dependent variable, the rest are the
  # independent variables.
  #
  # @param file [String] path of the training CSV
  def initialize(file)
    puts "Reading training data from #{file}."
    @train = File.readlines(file).collect { |l| l.chomp.split(",") }
    header = @train.shift
    @dependent_variable_name = header[1]
    @independent_variable_names = header[2..-1]
  end

  # Determines the dependent ("binary"/"numeric") and independent
  # ("set"/"numeric") variable types and converts the training cells in place.
  #
  # Both types are validated before anything is converted, and the method is a
  # no-op once the types are known, so repeated calls (predict calls it every
  # time) never see already-converted cells.
  #
  # @raise [RuntimeError] when either variable group is of mixed/unsupported type
  def model_type
    return if @dependent_variable_type && @independent_variable_type

    puts "Determining model type."
    dependent_type = detect_dependent_type
    independent_type = detect_independent_type

    @train.each do |t|
      t[1] = dependent_type == "binary" ? t[1].to_i : t[1].to_f
      t[2..-1] = t[2..-1].collect(&:to_f) if independent_type == "numeric"
    end
    @dependent_variable_type = dependent_type
    @independent_variable_type = independent_type
  end

  # Reads a batch CSV and, for numeric models, writes
  # "<file without .csv>-prediction.csv" with one line per prediction.
  # For "set" models only the similarity thresholds are set; no prediction
  # file is produced by this code.
  #
  # Side effects for numeric models: writes common-variables.csv and expects
  # the external preprocessing.R script to produce scaled-variables.csv in the
  # directory of +file+. Relies on a Similarity.cosine helper defined elsewhere.
  #
  # @param file [String] path of the batch CSV
  # @raise [RuntimeError] when the batch variable type does not match the model,
  #   or when the preprocessing script exits unsuccessfully
  def predict(file)
    model_type
    puts "Reading prediction data from #{file}."
    @batch = File.readlines(file).collect { |l| l.chomp.split(",") }
    header = @batch.shift
    @batch_independent_variable_names = header[1..-1]

    batch_values = batch_independent_variables.flatten
    batch_type =
      if none_numeric?(batch_values) then "set"
      elsif all_numeric?(batch_values) then "numeric"
      end
    unless batch_type == @independent_variable_type
      # Report the offending *batch* values (the training values were already validated).
      raise "Incorrect batch independent variables [#{batch_values.uniq}]. Must be #{@independent_variable_type}."
    end

    case @independent_variable_type
    when "numeric"
      @minsim = [0.9, 0.7]
      @batch.each { |t| t[1..-1] = t[1..-1].collect(&:to_f) }
      select(@independent_variable_names & @batch_independent_variable_names)

      dir = File.dirname(file)
      common_path = File.join(dir, "common-variables.csv")
      scaled_path = File.join(dir, "scaled-variables.csv")

      write_common_variables(common_path)
      puts "Feature selection and scaling."
      run_preprocessing(common_path, scaled_path)
      puts "Reading scaled features."
      read_scaled_variables(scaled_path)
      puts "Predicting #{file}."
      write_predictions(prediction_path(file))
    when "set"
      @minsim = [0.5, 0.2]
    end
  end

  # Restricts training and batch rows (in place) to the given variables,
  # in the given order, and updates both name lists accordingly.
  #
  # @param variable_names [Array<String>] names present in both data sets
  def select(variable_names)
    model_variable_idx = [0, 1] + variable_names.collect { |n| @independent_variable_names.index(n) + 2 }
    batch_variable_idx = [0] + variable_names.collect { |n| @batch_independent_variable_names.index(n) + 1 }
    @train.collect! { |r| model_variable_idx.collect { |i| r[i] } }
    @batch.collect! { |r| batch_variable_idx.collect { |i| r[i] } }
    @independent_variable_names = variable_names
    @batch_independent_variable_names = variable_names
  end

  # Classifies one compound from its similarity-annotated training rows.
  #
  # Rows whose identifier equals +smiles+ are treated as experimental
  # measurements and excluded from the neighbours. Neighbours are the rows
  # above @minsim[0]; if fewer than two qualify, the looser @minsim[1] is used.
  # With fewer than two neighbours no prediction is made (nil values).
  #
  # @param smiles [Object] identifier of the query compound
  # @param train [Array<Array>] rows of [identifier, activity, similarity]
  # @return [Array<Array>] one row per experimental measurement (or a single
  #   row when there is none):
  #   [smiles, experimental, prediction, p0, p1, max similarity, neighbour count]
  def classification(smiles, train)
    experimental = train.select { |row| row[0] == smiles }
    candidates = train - experimental

    neighbors = candidates.select { |row| row[2] > @minsim[0] }
    # Non-destructive select: select! returns nil when no row is removed,
    # which previously turned the neighbour list into nil.
    neighbors = candidates.select { |row| row[2] > @minsim[1] } if neighbors.size < 2

    if neighbors.size < 2
      predicted = nil
      probabilities = [nil, nil]
    else
      probabilities = weighted_majority_vote(neighbors)
      predicted = probabilities[1] > probabilities[0] ? 1 : 0
    end

    max_similarity = neighbors.collect { |t| t[2] }.max
    experimental = [[nil, nil, nil]] if experimental.empty?
    experimental.collect do |measurement|
      # Use the activity of *this* experimental row, not experimental[1].
      [smiles, measurement[1], predicted] + probabilities + [max_similarity, neighbors.size]
    end
  end

  # Similarity-weighted vote over neighbours with activity 0 and 1.
  #
  # @param neighbors [Array<Array>] rows of [identifier, activity, similarity]
  # @return [Array<Float>] [weight for activity 0, weight for activity 1], each
  #   computed as max similarity * summed class similarity / total similarity
  def weighted_majority_vote(neighbors)
    weights = neighbors.collect { |n| n[2] }
    weights_sum = weights.sum.to_f
    weights_max = weights.max.to_f
    [0, 1].collect do |label|
      class_weights = neighbors.select { |n| n[1] == label }.collect { |n| n[2] }
      weights_max * class_weights.sum / weights_sum
    end
  end

  # @return [Array] column 0 of every training row
  def smiles
    @train.collect { |t| t[0] }
  end

  # @return [Array] column 1 of every training row
  def dependent_variables
    @train.collect { |t| t[1] }
  end

  # @return [Array<Array>] columns 2.. of every training row
  def independent_variables
    @train.collect { |t| t[2..-1] }
  end

  # @return [Array] column 0 of every batch row
  def batch_smiles
    @batch.collect { |t| t[0] }
  end

  # @return [Array<Array>] columns 1.. of every batch row
  def batch_independent_variables
    @batch.collect { |t| t[1..-1] }
  end

  private

  # True when the list is non-empty and every cell parses as a number.
  def all_numeric?(values)
    !values.empty? && values.all?(&:numeric?)
  end

  # True when the list is non-empty and no cell parses as a number.
  def none_numeric?(values)
    !values.empty? && values.none?(&:numeric?)
  end

  # Returns "binary" or "numeric" for the (still textual) dependent column.
  def detect_dependent_type
    values = dependent_variables
    labels = values.uniq
    # Order-independent check: the first label in the file may be "0" or "1".
    if labels.size == 2 && labels.include?("0") && labels.include?("1")
      "binary"
    elsif all_numeric?(values)
      "numeric"
    else
      raise "Incorrect model dependent variables [#{labels}]. Must be either [0,1] or numeric."
    end
  end

  # Returns "set" or "numeric" for the (still textual) independent columns.
  def detect_independent_type
    values = independent_variables.flatten
    if none_numeric?(values)
      "set"
    elsif all_numeric?(values)
      "numeric"
    else
      raise "Incorrect model independent variables [#{values.uniq}]. Must be either a set (fingerprints) or numeric."
    end
  end

  # Output path: strips a trailing ".csv" only, so the result can never equal
  # the input path (which would truncate the batch file on open).
  def prediction_path(file)
    file.sub(/\.csv\z/, "") + "-prediction.csv"
  end

  # Writes training rows (tagged "train") followed by batch rows (tagged
  # "predict") with their shared variables to +path+.
  def write_common_variables(path)
    File.open(path, "w+") do |f|
      f.print "CANSMI,dataset,"
      f.puts @independent_variable_names.join(",")
      @train.each { |row| f.puts ([row[0], "train"] + row[2..-1]).join(",") }
      @batch.each { |row| f.puts ([row[0], "predict"] + row[1..-1]).join(",") }
    end
  end

  # Runs the external preprocessing script and echoes its standard output.
  # The argument-array form bypasses the shell, so paths containing spaces or
  # shell metacharacters are passed through verbatim.
  def run_preprocessing(input_path, output_path)
    script = File.join(File.dirname(__FILE__), "..", "bin", "preprocessing.R")
    puts IO.popen(["Rscript", script, input_path, output_path], &:read)
    # Without this check a stale scaled file from an earlier run could be read.
    raise "Preprocessing failed (#{script})." unless $?.success?
  end

  # Loads the scaled variables into @scaled_train ([id, activity, *features])
  # and @scaled_batch ([id, *features]) and adopts the header's variable names.
  def read_scaled_variables(path)
    lines = File.readlines(path)
    @independent_variable_names = @batch_independent_variable_names = lines.shift.chomp.split(",")[2..-1]
    @scaled_train = []
    @scaled_batch = []
    train_index = 0
    lines.each do |line|
      items = line.chomp.split(",")
      case items[1]
      when "train"
        # Assumes the script keeps training rows in their original order and
        # drops none of them; the k-th "train" line takes the k-th label.
        items[1] = @train[train_index][1]
        train_index += 1
        @scaled_train << numeric_cells(items)
      when "predict"
        items.delete_at(1)
        @scaled_batch << numeric_cells(items)
      end
    end
  end

  # Converts every numeric-looking cell to Float and leaves the rest untouched.
  def numeric_cells(items)
    items.collect { |cell| cell.to_s.numeric? ? cell.to_f : cell }
  end

  # Classifies every scaled batch row against all scaled training rows and
  # writes the resulting lines to +path+.
  def write_predictions(path)
    File.open(path, "w+") do |f|
      # NOTE(review): "Simimilarity" looks like a typo; kept unchanged because
      # consumers of the file may match on the existing header.
      f.puts ["Canonical SMILES", "Experimental", "Prediction", "p-inactive", "p-active", "Max Simimilarity", "Nr. Neighbors"].join(",")
      @scaled_batch.each do |query|
        neighbors = @scaled_train.collect do |row|
          row[0..1] + [Similarity.cosine([row[2..-1], query[1..-1]])]
        end
        classification(query[0], neighbors).each { |result| f.puts result.join(",") }
      end
    end
  end
end