From 83591831c6e36c36d87159acba6afdfedab95522 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 18 Mar 2021 16:48:36 +0100
Subject: fingerprint predictions added

---
 lib/model.rb | 55 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 12 deletions(-)

(limited to 'lib/model.rb')

diff --git a/lib/model.rb b/lib/model.rb
index d62d889..1726690 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -6,7 +6,7 @@ end
 
 class Model 
 
-  attr_reader :train, :dependent_variable_name, :independent_variable_names
+  attr_reader :train, :dependent_variable_name, :independent_variable_names, :minsim
 
   def initialize file
     puts "Reading training data from #{file}."
@@ -14,13 +14,14 @@ class Model
     header = @train.shift
     @dependent_variable_name = header[1]
     @independent_variable_names = header[2..-1]
+    model_type
   end
 
   def model_type
     puts "Determining model type."
-    if dependent_variables.uniq == ["1","0"]
+    if dependent_variables.uniq.sort == ["0","1"]
       @dependent_variable_type = "binary"
-      @train.each {|t| t[1] == t[1].to_i}
+      @train.each {|t| t[1] = t[1].to_i}
     elsif dependent_variables.collect{|v| v.numeric?}.uniq == [true]
       @dependent_variable_type = "numeric"
       @train.each {|t| t[1] = t[1].to_f }
@@ -29,25 +30,25 @@ class Model
     end
     if independent_variables.flatten.collect{|v| v.numeric?}.uniq == [false]
       @independent_variable_type = "set"
+      @minsim = [0.5,0.2]
     elsif independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true]
       @independent_variable_type = "numeric"
       @train.each {|t| t[2..-1] = t[2..-1].collect{|v| v = v.to_f}}
+      @minsim = [0.9,0.7]
     else
       raise "Incorrect model independent variables [#{independent_variables.flatten.uniq}]. Must be either a set (fingerprints) or numeric."
     end
   end
 
   def predict file
-    model_type
     puts "Reading prediction data from #{file}."
     @batch = File.readlines(file).collect{|l| l.chomp.split(",")}
     header = @batch.shift
-    @batch_independent_variable_names = header[1..-1]
-    unless (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [false] and @independent_variable_type == "set") or (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true] and @independent_variable_type == "numeric")
+    unless (@batch.collect{|b| b[1..-1]}.flatten.collect{|v| v.numeric?}.uniq == [false] and @independent_variable_type == "set") or (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true] and @independent_variable_type == "numeric")
       raise "Incorrect batch independent variables [#{independent_variables.flatten.uniq}]. Must be #{@independent_variable_type}."
     end
     if @independent_variable_type == "numeric"
-      @minsim = [0.9,0.7]
+      @batch_independent_variable_names = header[1..-1]
       @batch.each {|t| t[1..-1] = t[1..-1].collect{|v| v = v.to_f}}
       select(@independent_variable_names & @batch_independent_variable_names)
       File.open(File.join(File.dirname(file),"common-variables.csv"),"w+") do |f|
@@ -81,14 +82,22 @@ class Model
       File.open(file.sub(".csv","-prediction.csv"),"w+") do |f|
         f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].join(",")
         @scaled_batch.each do |pred|
-          classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred|
+          classification(pred[0], @scaled_train.collect{|row| row[0..1] + [cosine([row[2..-1],pred[1..-1]])]}).each do |pred|
             f.puts pred.join(",")
-            #puts pred.join(",")
           end
         end
       end
     elsif @independent_variable_type == "set"
-      @minsim = [0.5,0.2]
+      puts "Predicting #{file}."
+      File.open(file.sub(".csv","-prediction.csv"),"w+") do |f|
+        f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].join(",")
+        @batch.each do |fingerprints|
+          smi = fingerprints.shift
+          classification(smi, @train.collect{|row| row[0..1] + [tanimoto([row[2..-1],fingerprints])]}).each do |pred|
+            f.puts pred.join(",")
+          end
+        end
+      end
     end
   end
 
@@ -107,8 +116,7 @@ class Model
     n = train.select{|row| row[2] > @minsim[0]}
     n = train.select!{|row| row[2] > @minsim[1]} if n.size < 2
     train = n
-    #puts train.sort_by{|r| r[2]}[0..5].collect{|r| r.join(",")}.join("\n")
-    #puts train.sort_by{|r| r[2]}.reverse.collect{|r| r.join(",")}.join("\n")
+    #p train.sort_by{|r| r[2]}[0..5]#.collect{|r| r.join(",")}.join("\n")
     if train.size < 2
       classification = nil
       probabilities = [nil,nil]
@@ -122,6 +130,29 @@ class Model
     end
   end
 
+  # Euclidean distance between the two vectors variables[0] and variables[1].
+  # @param variables [Array<Array<Float>>] pair of vectors, paired element-wise by zip (assumes equal length - TODO confirm)
+  # @return [Float] square root of the summed squared element differences
+  def euclid variables
+    sq = variables[0].zip(variables[1]).map{|a,b| (a - b) ** 2} # squared difference per position
+    Math.sqrt(sq.inject(0) {|s,c| s + c})
+  end
+
+  # Tanimoto similarity of two fingerprint arrays: size of their intersection divided by size of their union.
+  # @param fingerprints [Array<Array<String>>] pair of fingerprint arrays, combined with Array#& and Array#|
+  # @return [Float] intersection/union ratio; NaN when both arrays are empty (0 / 0.0)
+  def tanimoto fingerprints
+    ( fingerprints[0] & fingerprints[1] ).size/( fingerprints[0] | fingerprints[1] ).size.to_f # to_f on the divisor forces float division
+  end
+
+  # Cosine similarity of the two vectors variables[0] and variables[1].
+  #   http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+  # @param variables [Array<Array<Float>>] pair of vectors; each must respond to dot_product and magnitude (not defined in this patch - presumably Array extensions elsewhere, verify)
+  # @return [Float] dot product divided by the product of both magnitudes
+  def cosine variables
+    variables[0].dot_product(variables[1]) / (variables[0].magnitude * variables[1].magnitude)
+  end
+
   def weighted_majority_vote neighbors
     w = [neighbors.select{|n| n[1] == 0}.collect{|n| n[2]}, neighbors.select{|n| n[1] == 1}.collect{|n| n[2]}]
     weights_sum = neighbors.collect{|n| n[2]}.sum.to_f
-- 
cgit v1.2.3