diff options
author | Christoph Helma <helma@in-silico.ch> | 2021-02-12 19:54:07 +0100 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2021-02-12 19:54:07 +0100 |
commit | a29eb3e38414cd252850c9c4fb356f8b2bef6fb4 (patch) | |
tree | a957d9ac455e7345c51f3ab6075698f552c497d1 /lib/model.rb | |
parent | 158e9a7ecbc467c3db77c354f203b1176b0fc3f2 (diff) |
model.rb refactored, mp2d models updated
Diffstat (limited to 'lib/model.rb')
-rw-r--r-- | lib/model.rb | 236 |
1 files changed, 144 insertions, 92 deletions
diff --git a/lib/model.rb b/lib/model.rb index 0e011c5..c1dcb4e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -1,26 +1,13 @@ +require 'matrix' + class Model def initialize dir @dir = dir - @dependent_variables = File.readlines(File.join(@dir,"dependent_variables")).collect{|v| v.chomp} - @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp - if @dependent_variable_type == "binary" - abort "Incorrect dependent variable values '#{@dependent_variables.uniq.sort.join(",")}' for #{@dependent_variable_type} values" unless @dependent_variables.uniq.sort == ["0","1"] - @dependent_variables = @dependent_variables.collect{|v| v.to_i} - elsif @dependent_variable_type == "numeric" - # TODO check for floats - @dependent_variables = @dependent_variables.collect{|v| v.to_f} - end - @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp - @independent_variables = [] - @smiles = [] - File.readlines(File.join(@dir,"independent_variables")).each do |line| - items = line.chomp.split(",") - @smiles << items.shift - items.collect!{|v| v.to_f} if @independent_variable_type == "numeric" - @independent_variables << items - end - @similarity_thresholds = File.readlines(File.join(@dir,"similarity_thresholds")).collect{|v| v.chomp.to_f} + @similarity_thresholds = File.readlines(File.join(@dir,"similarity-thresholds")).collect{|v| v.chomp.to_f} + @smiles = Vector[ *File.readlines(File.join(@dir,"smiles")).collect{|v| v.chomp} ] + @dependent_variables = Vector[ *File.readlines(File.join(@dir,"dependent-variables")).collect{|v| v.chomp} ] + abort "Unequal number of smiles (#{@smiles.size}) and dependent-variables (#{@dependent_variables.size})." unless @smiles.size == @dependent_variables.size end def crossvalidation folds=10 @@ -29,89 +16,124 @@ class Model indices = (0..nr_instances-1).to_a.shuffle mid = (nr_instances/folds) start = 0 + threads = [] 0.upto(folds-1) do |i| - t = Time.now - print "Fold #{i}: " - # split train data - last = start+mid - last = last-1 unless nr_instances%folds > i - test_idxs = indices[start..last] || [] - idxs = { - :test => test_idxs, - :train => indices-test_idxs - } - start = last+1 - # write training/test data - cv_dir = File.join(@dir,"crossvalidation",i.to_s) - dirs = {} - idxs.each do |t,idx| - d = File.join cv_dir,t.to_s - dirs[t] = d - FileUtils.mkdir_p d - File.open(File.join(d,"independent_variables"),"w+") do |f| - idx.each do |i| - f.print "#{@smiles[i]}," - f.puts @independent_variables[i].join(",") - end - end - File.open(File.join(d,"dependent_variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")} - if t == :train - File.open(File.join(d,"dependent_variable_type"),"w+"){ |f| f.puts @dependent_variable_type } - File.open(File.join(d,"independent_variable_type"),"w+"){ |f| f.puts @independent_variable_type } - File.open(File.join(d,"similarity_thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") } + threads << Thread.new do + t = Time.now + puts "Fold #{i} started" + # split train data + last = start+mid + last = last-1 unless nr_instances%folds > i + test_idxs = indices[start..last] || [] + idxs = { + :test => test_idxs, + :train => indices-test_idxs + } + start = last+1 + # write training/test data + cv_dir = File.join(@dir,"crossvalidation",i.to_s) + dirs = {} + idxs.each do |t,idx| + d = File.join cv_dir,t.to_s + dirs[t] = d + FileUtils.mkdir_p d + File.open(File.join(d,"independent-variables"),"w+") { |f| f.puts idx.collect{|i| @independent_variables[i].join(",")}.join("\n") } + File.open(File.join(d,"smiles"),"w+") { |f| f.puts idx.collect{|i| @smiles[i]}.join("\n") } + File.open(File.join(d,"dependent-variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")} + File.open(File.join(d,"similarity-thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") } if t == :train end + # predict + train_model = self.class.new dirs[:train] + train_model.batch_predict dirs[:test], File.join(dirs[:test],"predictions") + puts "Fold #{i}: #{(Time.now-t)/60} min" + end + end + threads.each(&:join) + puts "Total: #{(Time.now-start_time)/60} min" + end + + def batch_predict dir, out=$stdout + prediction_smiles = File.readlines(File.join(dir,"smiles")).collect{|smi| smi.chomp} + File.open(out, "w+") do |f| + File.readlines(File.join(dir,"independent-variables")).each_with_index do |line,i| + variables = line.chomp.split(",") + f.puts predict(prediction_smiles[i],variables).join(",") end - # predict - train_model = self.class.new dirs[:train] - train_model.predict_fold File.join(dirs[:test],"independent_variables") - puts Time.now-t end - puts "Total: #{Time.now-start_time}" end end -class ClassificationModel < Model +module Cosine - def predict_fold independent_variable_file - pred_dir = File.dirname independent_variable_file - predictions = [] - File.readlines(independent_variable_file).each do |line| - variables = line.chomp.split(",") - smiles = variables.shift - variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric" - predictions << predict(smiles,variables) + def preprocess + puts "Feature selection" + t = Time.now + @selected = (0..@independent_variables.first.size-1).to_a + columns = Matrix[ *@independent_variables ].column_vectors + columns.each_with_index do |c,i| + next unless @selected.include? i + p "#{i}/#{@selected.size}" + # remove variables with zero variances + if c.to_a.zero_variance? + @selected.delete i + next + end + # remove correlated variables + (i+1..columns.size-1).each do |j| + next unless @selected.include? j + @selected.delete(j) if c.to_a.r(columns[j].to_a).abs > 0.9 + end end - File.open(File.join(pred_dir,"classification"),"w+") { |f| predictions.each {|p| f.puts p.join(",")} } + @selected.sort! + p + mat = @selected.collect{|i| @independent_variables[i]} + columns = Matrix[ *mat ].column_vectors + @independent_variable_means = columns.collect{|c| c.to_a.mean} + @independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation} + scaled_columns = [] + columns.each_with_index{|col,i| scaled_columns << col.collect{|v| v ? (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i] : nil}} + @scaled_independent_variables = Matrix.columns(scaled_columns).to_a + p @scaled_independent_variables.size, @selected_variable_means.size, @selected_variable_standard_deviations.size + puts (Time.now-t)/60 end - def predict_file independent_variable_file - predictions = [] - File.readlines(independent_variable_file).each do |line| - variables = line.chomp.split(",") - variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric" - puts predict("",variables).join(",") - end + def predict smiles, variables + variables.collect!{|v| v.to_f} + preprocess unless @scaled_independent_variables # lazy preprocessing + selected_variables = @selected.collect{|i| variables[i]} + scaled_variables = selected_variables.each_with_index{|v,i| (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i]} + similarities = @scaled_independent_variables.collect{|row| Similarity.cosine([row,scaled_variables])} + similarity_prediction smiles, similarities end +end + +module Tanimoto + def predict_smiles smiles c = Compound.from_smiles(smiles) - predict c.smiles, c.fingerprint + predict smiles, c.fingerprint end - - def predict smiles, variables - similarities = [] - @independent_variables.each do |row| - if @independent_variable_type == "binary" - similarities << Similarity.tanimoto([row, variables]) - elsif @independent_variable_type == "numeric" - similarities << Similarity.cosine([row, variables]) - end - end - neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[1]} - neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold + def predict smiles, fingerprint + similarities = @independent_variables.collect{|row| Similarity.tanimoto([row,fingerprint])} + similarity_prediction smiles, similarities + end +end + +class ClassificationModel < Model + + def initialize dir + super dir + abort "Incorrect binary dependent variable values (#{@dependent_variables.uniq.sort.join(",")}). Expecting 0 and 1." unless @dependent_variables.uniq.sort == ["0","1"] + @dependent_variables = @dependent_variables.collect{|v| v.to_i} + end + + def similarity_prediction smiles, similarities + neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[1]} + neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold neighbor_idx.select!{|i| @smiles[i] != smiles} # remove identical compounds - experimental = @dependent_variables[@smiles.index(smiles)] if @smiles.include? smiles + experimental = @dependent_variables[@smiles.to_a.index(smiles)] if @smiles.include? smiles return [smiles,experimental,nil,nil,nil,similarities.max,neighbor_idx.size] if neighbor_idx.size < 2 neighbor_dependent_variables = neighbor_idx.collect{|i| @dependent_variables[i]} @@ -119,20 +141,17 @@ class ClassificationModel < Model probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_similarities) probabilities[1] > probabilities[0] ? classification = 1 : classification = 0 - #p neighbor_dependent_variables.join "," - #p neighbor_similarities.join "," - #p neighbor_idx.collect{|i| @smiles[i]} [ smiles, experimental, classification ] + probabilities + [ neighbor_similarities.max, neighbor_idx.size ] end - + # Weighted majority vote - # @param [Array<0,1>] dependent_variables + # @param [Array<0,1>] neighbor_dependent_variables # @param [Array<Float>] weights # @return [Array] probabilities - def weighted_majority_vote dependent_variables, weights + def weighted_majority_vote neighbor_dependent_variables, weights w = [] - w[0] = weights.each_index.select{|i| dependent_variables[i] == 0}.collect{|i| weights[i]} - w[1] = weights.each_index.select{|i| dependent_variables[i] == 1}.collect{|i| weights[i]} + w[0] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 0}.collect{|i| weights[i]} + w[1] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 1}.collect{|i| weights[i]} weights_sum = weights.sum.to_f weights_max = weights.max.to_f probabilities = [] @@ -141,3 +160,36 @@ class ClassificationModel < Model probabilities end end + +class RegressionModel < Model +end + +class TanimotoClassificationModel < ClassificationModel + include Tanimoto + + def initialize dir + super dir + @independent_variables = Vector[ *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",") } ] + abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.size})." unless @dependent_variables.size == @independent_variables.size + end +end + +class CosineClassificationModel < ClassificationModel + include Cosine + + def initialize dir + super dir + @independent_variables = Matrix[ + *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",").collect{|v| v.to_f} } + ] + abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.row_vectors.size})." unless @dependent_variables.size == @independent_variables.row_vectors.size + abort "Unequal number of independent-variable-names (#{@independent_variable_names.size}) and independent-variables columns (#{@independent_variables.column_vectors.size})." unless @independent_variable_names.size == @independent_variables.row_vectors.size + end + +end + +class TanimotoRegressionModel < RegressionModel +end + +class CosineRegressionModel < RegressionModel +end |