require 'fileutils'
require 'matrix'

# Base class for similarity-based models that live in a directory.
#
# The model directory is expected to contain the line-oriented files
# "similarity-thresholds", "smiles" and "dependent-variables"; subclasses
# additionally load "independent-variables" into @independent_variables.
# Subclasses (or mixed-in modules) must provide
# `predict(smiles, variables)` returning an Array.
class Model

  # @param dir [String] directory holding the model files
  # Aborts the process if the number of smiles and dependent variables differ.
  def initialize dir
    @dir = dir
    @similarity_thresholds = read_lines("similarity-thresholds").collect{|v| v.to_f}
    @smiles = Vector[ *read_lines("smiles") ]
    @dependent_variables = Vector[ *read_lines("dependent-variables") ]
    abort "Unequal number of smiles (#{@smiles.size}) and dependent-variables (#{@dependent_variables.size})." unless @smiles.size == @dependent_variables.size
  end

  # Run a k-fold crossvalidation, one thread per fold.
  #
  # For each fold i the training and test data are written to
  # <dir>/crossvalidation/<i>/{train,test} and the predictions for the test
  # set to <dir>/crossvalidation/<i>/test/predictions.
  #
  # @param folds [Integer] number of folds
  def crossvalidation folds=10
    start_time = Time.now
    # to_a yields an Array of row Arrays for Matrix, Vector and Array alike
    rows = @independent_variables.to_a
    indices = (0...rows.size).to_a.shuffle
    # All splits are computed up front: deriving them inside the threads
    # from a shared offset would make fold boundaries depend on scheduling.
    test_sets = fold_test_indices indices, folds

    threads = test_sets.each_with_index.collect do |test_idxs,fold|
      Thread.new do
        fold_start = Time.now
        puts "Fold #{fold} started"
        idxs = { :test => test_idxs, :train => indices-test_idxs }

        # write training/test data
        cv_dir = File.join(@dir,"crossvalidation",fold.to_s)
        dirs = {}
        idxs.each do |set,idx|
          d = File.join cv_dir,set.to_s
          dirs[set] = d
          FileUtils.mkdir_p d
          write_lines File.join(d,"independent-variables"), idx.collect{|n| rows[n].join(",")}
          write_lines File.join(d,"smiles"), idx.collect{|n| @smiles[n]}
          write_lines File.join(d,"dependent-variables"), idx.collect{|n| @dependent_variables[n]}
          # thresholds are only needed to instantiate the training model
          write_lines File.join(d,"similarity-thresholds"), @similarity_thresholds if set == :train
        end

        # predict
        train_model = self.class.new dirs[:train]
        train_model.batch_predict dirs[:test], File.join(dirs[:test],"predictions")
        puts "Fold #{fold}: #{(Time.now-fold_start)/60} min"
      end
    end
    threads.each(&:join)
    puts "Total: #{(Time.now-start_time)/60} min"
  end

  # Predict every instance found in dir ("smiles" and
  # "independent-variables" files) and write one comma-separated prediction
  # per line.
  #
  # @param dir [String] directory with the instances to predict
  # @param out [String, IO] output file name or an IO-like object
  #   (anything responding to #puts, e.g. $stdout or a StringIO)
  def batch_predict dir, out=$stdout
    prediction_smiles = File.readlines(File.join(dir,"smiles")).collect{|smi| smi.chomp}
    write_predictions = lambda do |io|
      File.readlines(File.join(dir,"independent-variables")).each_with_index do |line,i|
        variables = line.chomp.split(",")
        io.puts predict(prediction_smiles[i],variables).join(",")
      end
    end
    if out.respond_to? :puts
      # IO-like objects cannot be passed to File.open
      write_predictions.call out
    else
      File.open(out, "w+") { |f| write_predictions.call f }
    end
  end

  private

  # Read a file from the model directory as an Array of chomped lines.
  def read_lines name
    File.readlines(File.join(@dir,name)).collect{|v| v.chomp}
  end

  # Write the given values to path, one per line.
  def write_lines path, lines
    File.open(path,"w+") { |f| f.puts lines.join("\n") }
  end

  # Split indices into `folds` consecutive test sets; the first
  # (indices.size % folds) sets receive one extra element.
  def fold_test_indices indices, folds
    base_size, remainder = indices.size.divmod(folds)
    offset = 0
    (0...folds).collect do |fold|
      size = fold < remainder ? base_size+1 : base_size
      test_idxs = indices[offset,size] || []
      offset += size
      test_idxs
    end
  end

end

# Cosine similarity on scaled, feature-selected numeric variables.
#
# Relies on Array#zero_variance?, Array#r, Array#mean,
# Array#standard_deviation and Similarity.cosine, none of which are defined
# in this file (presumably project extensions - confirm).
module Cosine

  # Select variables (dropping zero-variance columns and columns whose
  # absolute correlation with an earlier kept column exceeds 0.9), then
  # scale the selected columns by (value - mean) / standard deviation.
  #
  # Sets @selected (kept column indices), @selected_variable_means,
  # @selected_variable_standard_deviations and @scaled_independent_variables
  # (Array of scaled rows). Diagnostics go to stderr so that they cannot mix
  # with predictions written to stdout.
  def preprocess
    warn "Feature selection"
    t = Time.now
    rows = @independent_variables.to_a
    all_columns = rows.transpose
    keep = Array.new(all_columns.size, true)
    all_columns.each_with_index do |c,i|
      next unless keep[i]
      # remove variables with zero variances
      if c.zero_variance?
        keep[i] = false
        next
      end
      # remove correlated variables
      (i+1...all_columns.size).each do |j|
        next unless keep[j]
        keep[j] = false if c.r(all_columns[j]).abs > 0.9
      end
    end
    @selected = keep.each_index.select{|i| keep[i]}

    columns = @selected.collect{|i| all_columns[i]}
    @selected_variable_means = columns.collect{|c| c.mean}
    @selected_variable_standard_deviations = columns.collect{|c| c.standard_deviation}
    scaled_columns = columns.each_with_index.collect do |col,i|
      col.collect{|v| v ? (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i] : nil}
    end
    # back to row orientation: one scaled row per training instance
    @scaled_independent_variables = scaled_columns.transpose
    warn "#{@scaled_independent_variables.size} instances, #{@selected.size} selected variables, #{(Time.now-t)/60} min"
  end

  # @param smiles [String] query structure
  # @param variables [Array<#to_f>] unscaled variables of the query, in the
  #   column order of @independent_variables (the array is not modified)
  # @return [Array] result of similarity_prediction
  def predict smiles, variables
    values = variables.collect{|v| v.to_f}
    preprocess unless @scaled_independent_variables # lazy preprocessing
    scaled_variables = @selected.each_with_index.collect do |column,i|
      (values[column]-@selected_variable_means[i])/@selected_variable_standard_deviations[i]
    end
    similarities = @scaled_independent_variables.collect{|row| Similarity.cosine([row,scaled_variables])}
    similarity_prediction smiles, similarities
  end

end

# Tanimoto similarity on fingerprints.
#
# Relies on Compound.from_smiles and Similarity.tanimoto, which are not
# defined in this file.
module Tanimoto

  # Predict from a SMILES string, using the fingerprint of the compound.
  def predict_smiles smiles
    c = Compound.from_smiles(smiles)
    predict smiles, c.fingerprint
  end

  # @param smiles [String] query structure
  # @param fingerprint [Array] fingerprint of the query
  # @return [Array] result of similarity_prediction
  def predict smiles, fingerprint
    similarities = @independent_variables.collect{|row| Similarity.tanimoto([row,fingerprint])}
    similarity_prediction smiles, similarities
  end

end

# Binary (0/1) classification by similarity-weighted majority vote.
class ClassificationModel < Model

  # Aborts unless the dependent variables consist of exactly "0" and "1".
  def initialize dir
    super dir
    abort "Incorrect binary dependent variable values (#{@dependent_variables.uniq.sort.join(",")}). Expecting 0 and 1." unless @dependent_variables.uniq.sort == ["0","1"]
    @dependent_variables = @dependent_variables.collect{|v| v.to_i}
  end

  # @param smiles [String] query structure
  # @param similarities [#to_a] similarity of the query to each training instance
  # @return [Array] [smiles, experimental value (nil if the query is not in
  #   the training data), classification, probability of 0, probability of 1,
  #   maximum neighbor similarity, number of neighbors]; classification and
  #   probabilities are nil when fewer than two neighbors remain, and the
  #   similarity entry is then the maximum over all instances
  def similarity_prediction smiles, similarities
    sims = similarities.to_a
    neighbor_idx = sims.each_index.select{|i| sims[i] > @similarity_thresholds[1]}
    # lower similarity threshold
    neighbor_idx = sims.each_index.select{|i| sims[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2
    # remove identical compounds
    neighbor_idx = neighbor_idx.reject{|i| @smiles[i] == smiles}
    known = @smiles.to_a.index(smiles)
    experimental = known ? @dependent_variables[known] : nil
    return [smiles,experimental,nil,nil,nil,sims.max,neighbor_idx.size] if neighbor_idx.size < 2

    neighbor_dependent_variables = neighbor_idx.collect{|i| @dependent_variables[i]}
    neighbor_similarities = neighbor_idx.collect{|i| sims[i]}
    probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_similarities)
    classification = probabilities[1] > probabilities[0] ? 1 : 0
    [ smiles, experimental, classification ] + probabilities + [ neighbor_similarities.max, neighbor_idx.size ]
  end

  # Weighted majority vote
  # @param [Array<0,1>] neighbor_dependent_variables
  # @param [Array] weights
  # @return [Array] probabilities for class 0 and class 1: the weight share
  #   of each class multiplied by the maximum weight
  def weighted_majority_vote neighbor_dependent_variables, weights
    weights_sum = weights.sum.to_f
    weights_max = weights.max.to_f
    [0,1].collect do |klass|
      class_weights = weights.each_index.select{|i| neighbor_dependent_variables[i] == klass}.collect{|i| weights[i]}
      weights_max*class_weights.sum/weights_sum
    end
  end

end

class RegressionModel < Model
end

# Classification with fingerprints; independent variables are kept as a
# Vector of string Arrays, one per training instance.
class TanimotoClassificationModel < ClassificationModel
  include Tanimoto

  def initialize dir
    super dir
    @independent_variables = Vector[ *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",") } ]
    abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.size})." unless @dependent_variables.size == @independent_variables.size
  end

end

# Classification with numeric variables; independent variables are kept as
# a Matrix of floats with one row per training instance.
class CosineClassificationModel < ClassificationModel
  include Cosine

  def initialize dir
    super dir
    @independent_variables = Matrix[ *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",").collect{|v| v.to_f} } ]
    abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.row_count})." unless @dependent_variables.size == @independent_variables.row_count
    # Nothing in this file assigns @independent_variable_names, so the
    # check is only applied when names are present.
    if @independent_variable_names
      abort "Unequal number of independent-variable-names (#{@independent_variable_names.size}) and independent-variables columns (#{@independent_variables.column_count})." unless @independent_variable_names.size == @independent_variables.column_count
    end
  end

end

class TanimotoRegressionModel < RegressionModel
end

class CosineRegressionModel < RegressionModel
end