From 83591831c6e36c36d87159acba6afdfedab95522 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 18 Mar 2021 16:48:36 +0100 Subject: fingerprint predictions added --- lib/model.rb | 55 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index d62d889..1726690 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -6,7 +6,7 @@ end class Model - attr_reader :train, :dependent_variable_name, :independent_variable_names + attr_reader :train, :dependent_variable_name, :independent_variable_names, :minsim def initialize file puts "Reading training data from #{file}." @@ -14,13 +14,14 @@ class Model header = @train.shift @dependent_variable_name = header[1] @independent_variable_names = header[2..-1] + model_type end def model_type puts "Determining model type." - if dependent_variables.uniq == ["1","0"] + if dependent_variables.uniq.sort == ["0","1"] @dependent_variable_type = "binary" - @train.each {|t| t[1] == t[1].to_i} + @train.each {|t| t[1] = t[1].to_i} elsif dependent_variables.collect{|v| v.numeric?}.uniq == [true] @dependent_variable_type = "numeric" @train.each {|t| t[1] = t[1].to_f } @@ -29,25 +30,25 @@ class Model end if independent_variables.flatten.collect{|v| v.numeric?}.uniq == [false] @independent_variable_type = "set" + @minsim = [0.5,0.2] elsif independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true] @independent_variable_type = "numeric" @train.each {|t| t[2..-1] = t[2..-1].collect{|v| v = v.to_f}} + @minsim = [0.9,0.7] else raise "Incorrect model independent variables [#{independent_variables.flatten.uniq}]. Must be either a set (fingerprints) or numeric." end end def predict file - model_type puts "Reading prediction data from #{file}." 
@batch = File.readlines(file).collect{|l| l.chomp.split(",")} header = @batch.shift - @batch_independent_variable_names = header[1..-1] - unless (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [false] and @independent_variable_type == "set") or (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true] and @independent_variable_type == "numeric") + unless (@batch.collect{|b| b[1..-1]}.flatten.collect{|v| v.numeric?}.uniq == [false] and @independent_variable_type == "set") or (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true] and @independent_variable_type == "numeric") raise "Incorrect batch independent variables [#{independent_variables.flatten.uniq}]. Must be #{@independent_variable_type}." end if @independent_variable_type == "numeric" - @minsim = [0.9,0.7] + @batch_independent_variable_names = header[1..-1] @batch.each {|t| t[1..-1] = t[1..-1].collect{|v| v = v.to_f}} select(@independent_variable_names & @batch_independent_variable_names) File.open(File.join(File.dirname(file),"common-variables.csv"),"w+") do |f| @@ -81,14 +82,22 @@ class Model File.open(file.sub(".csv","-prediction.csv"),"w+") do |f| f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].join(",") @scaled_batch.each do |pred| - classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred| + classification(pred[0], @scaled_train.collect{|row| row[0..1] + [cosine([row[2..-1],pred[1..-1]])]}).each do |pred| f.puts pred.join(",") - #puts pred.join(",") end end end elsif @independent_variable_type == "set" - @minsim = [0.5,0.2] + puts "Predicting #{file}." + File.open(file.sub(".csv","-prediction.csv"),"w+") do |f| + f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. 
Neighbors"].join(",") + @batch.each do |fingerprints| + smi = fingerprints.shift + classification(smi, @train.collect{|row| row[0..1] + [tanimoto([row[2..-1],fingerprints])]}).each do |pred| + f.puts pred.join(",") + end + end + end end end @@ -107,8 +116,7 @@ class Model n = train.select{|row| row[2] > @minsim[0]} n = train.select!{|row| row[2] > @minsim[1]} if n.size < 2 train = n - #puts train.sort_by{|r| r[2]}[0..5].collect{|r| r.join(",")}.join("\n") - #puts train.sort_by{|r| r[2]}.reverse.collect{|r| r.join(",")}.join("\n") + #p train.sort_by{|r| r[2]}[0..5]#.collect{|r| r.join(",")}.join("\n") if train.size < 2 classification = nil probabilities = [nil,nil] @@ -122,6 +130,29 @@ class Model end end + # Get Euclidean distance + # @param [Array<Array<Float>>] + # @return [Float] + def euclid variables + sq = variables[0].zip(variables[1]).map{|a,b| (a - b) ** 2} + Math.sqrt(sq.inject(0) {|s,c| s + c}) + end + + # Get Tanimoto similarity + # @param [Array<Array<String>>] + # @return [Float] + def tanimoto fingerprints + ( fingerprints[0] & fingerprints[1] ).size/( fingerprints[0] | fingerprints[1] ).size.to_f + end + + # Get cosine similarity + # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity + # @param [Array<Array<Float>>] + # @return [Float] + def cosine variables + variables[0].dot_product(variables[1]) / (variables[0].magnitude * variables[1].magnitude) + end + def weighted_majority_vote neighbors w = [neighbors.select{|n| n[1] == 0}.collect{|n| n[2]}, neighbors.select{|n| n[1] == 1}.collect{|n| n[2]}] weights_sum = neighbors.collect{|n| n[2]}.sum.to_f -- cgit v1.2.3