From 1dcd741a5bff8dc41abf0840f59031eb557ff230 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 8 Mar 2021 21:25:45 +0100
Subject: neighbor selection adjusted, summary for high-confidence predictions

---
 bin/classification-summary.rb | 34 ++++++++++++++++++++++++++++------
 bin/crossvalidation-folds.rb  |  9 +--------
 lib/model.rb                  | 26 +++++++++++---------------
 3 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/bin/classification-summary.rb b/bin/classification-summary.rb
index c6755a1..45ffb29 100755
--- a/bin/classification-summary.rb
+++ b/bin/classification-summary.rb
@@ -3,6 +3,7 @@ require_relative "../lib/lazar"
 #stat = ClassificationStatistics.new ARGV[0]
 #stat.summary
 dir = File.join(File.dirname(ARGV[0]),"crossvalidation")
+thresh = ARGV[1].to_f
 folds = Dir[File.join(dir,"[0-9]*")]
 
 predictions = []
@@ -10,7 +11,11 @@ tp=0
 tn=0
 fp=0
 fn=0
-n=0
+hc_tp=0
+hc_tn=0
+hc_fp=0
+hc_fn=0
+#n=0
 experimental = {}
 
 lines = File.readlines(File.join(ARGV[0])) 
@@ -28,39 +33,48 @@ File.open(File.join(dir,"predictions.csv"),"w+") do |f|
     pred.each do |prediction|
       smi = prediction[0]
       exp = experimental[smi]
+      maxsim = prediction[5].to_f
+      v = "NA"
       unless exp.nil? or prediction[2].empty? or exp.empty?
         p = prediction[2].to_i
-        n+=1
-        v = "NA"
+        #n+=1
         exp.each do |e|
           if p and e
             if p == 1 and e == 1
               v = "TP"
               tp+=1
+              hc_tp+=1 if maxsim > thresh
             elsif p == 0 and e == 0
               v = "TN"
               tn+=1
+              hc_tn+=1 if maxsim > thresh
             elsif p == 1 and e == 0
               v = "FP"
               fp+=1
+              hc_fp+=1 if maxsim > thresh
             elsif p == 0 and e == 1
               v = "FN"
               fn+=1
+              hc_fn+=1 if maxsim > thresh
             end
           end
           predictions << v
         end
-      f.puts([smi,v].join(","))
       end
+      f.puts([smi,v,maxsim].join(","))
     end
   end
 end
 
-File.open(File.join(dir,"confusion-matrix.csv"),"w+") do |f|
+File.open(File.join(dir,"confusion-matrix-all.csv"),"w+") do |f|
   f.puts "#{tp},#{fp}\n#{fn},#{tn}"
 end
 
-File.open(File.join(dir,"summary.csv"),"w+") do |f|
+File.open(File.join(dir,"confusion-matrix-high-confidence.csv"),"w+") do |f|
+  f.puts "#{hc_tp},#{hc_fp}\n#{hc_fn},#{hc_tn}"
+end
+
+File.open(File.join(dir,"summary-all.csv"),"w+") do |f|
   f.puts "accuracy,#{(tp+tn)/(tp+fp+tn+fn).to_f}"
   f.puts "true_positive_rate,#{tp/(tp+fn).to_f}"
   f.puts "true_negative_rate,#{tn/(tn+fp).to_f}"
@@ -68,3 +82,11 @@ File.open(File.join(dir,"summary.csv"),"w+") do |f|
   f.puts "negative_predictive_value,#{tn/(tn+fn).to_f}"
 end
 
+File.open(File.join(dir,"summary-high-confidence.csv"),"w+") do |f|
+  f.puts "accuracy,#{(hc_tp+hc_tn)/(hc_tp+hc_fp+hc_tn+hc_fn).to_f}"
+  f.puts "true_positive_rate,#{hc_tp/(hc_tp+hc_fn).to_f}"
+  f.puts "true_negative_rate,#{hc_tn/(hc_tn+hc_fp).to_f}"
+  f.puts "positive_predictive_value,#{hc_tp/(hc_tp+hc_fp).to_f}"
+  f.puts "negative_predictive_value,#{hc_tn/(hc_tn+hc_fn).to_f}"
+end
+
diff --git a/bin/crossvalidation-folds.rb b/bin/crossvalidation-folds.rb
index 0c765f7..16a4103 100755
--- a/bin/crossvalidation-folds.rb
+++ b/bin/crossvalidation-folds.rb
@@ -17,6 +17,7 @@ start = 0
       :train => indices-test_idxs,
       :test => test_idxs
     }
+    p idxs
     start = last+1
     # write training/test data
     cv_dir = File.join(File.dirname(ARGV[0]),"crossvalidation",i.to_s)
@@ -39,14 +40,6 @@ start = 0
             f.puts t.join(",")
           end
         end
-        file = File.join(cv_dir,t.to_s+"-experimental.csv")
-        File.open(file,"w+") do |f|
-          f.puts (["Canonical SMILES", model.dependent_variable_name]).join(",")
-          idx.collect{|i| model.train[i]}.each do |t|
-            # TODO fix
-            f.puts t[0..1].join(",")
-          end
-        end
       end
     end
     Process.waitall
diff --git a/lib/model.rb b/lib/model.rb
index c4ca1f3..d62d889 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -20,7 +20,7 @@ class Model
     puts "Determining model type."
     if dependent_variables.uniq == ["1","0"]
       @dependent_variable_type = "binary"
-      @train.each {|t| t[1] == "1" ? t[1] = true : t[1] = false }
+      @train.each {|t| t[1] = t[1].to_i} # assign, not compare: a bare == here is a no-op and leaves the labels as "1"/"0" strings
     elsif dependent_variables.collect{|v| v.numeric?}.uniq == [true]
       @dependent_variable_type = "numeric"
       @train.each {|t| t[1] = t[1].to_f }
@@ -38,7 +38,6 @@ class Model
   end
 
   def predict file
-=begin
     model_type
     puts "Reading prediction data from #{file}."
     @batch = File.readlines(file).collect{|l| l.chomp.split(",")}
@@ -48,6 +47,7 @@ class Model
       raise "Incorrect batch independent variables [#{independent_variables.flatten.uniq}]. Must be #{@independent_variable_type}."
     end
     if @independent_variable_type == "numeric"
+      @minsim = [0.9,0.7]
       @batch.each {|t| t[1..-1] = t[1..-1].collect{|v| v = v.to_f}}
       select(@independent_variable_names & @batch_independent_variable_names)
       File.open(File.join(File.dirname(file),"common-variables.csv"),"w+") do |f|
@@ -62,7 +62,6 @@ class Model
       end
       puts "Feature selection and scaling."
       puts `Rscript #{File.join(File.dirname(__FILE__),"..","bin","preprocessing.R")} #{File.join(File.dirname(file),"common-variables.csv")} #{File.join(File.dirname(file),"scaled-variables.csv")}`
-=end
       puts "Reading scaled features."
       lines = File.readlines(File.join(File.dirname(file),"scaled-variables.csv"))
       @independent_variable_names = @batch_independent_variable_names = lines.shift.chomp.split(",")[2..-1]
@@ -82,14 +81,15 @@ class Model
       File.open(file.sub(".csv","-prediction.csv"),"w+") do |f|
         f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].join(",")
         @scaled_batch.each do |pred|
-          classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Distance.euclid([row[2..-1],pred[1..-1]])]}).each do |pred|
-          #classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred|
+          classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred|
             f.puts pred.join(",")
-            puts pred.join(",")
+            #puts pred.join(",")
           end
         end
       end
-    #end
+    elsif @independent_variable_type == "set"
+      @minsim = [0.5,0.2]
+    end
   end
 
   def select variable_names
@@ -104,15 +104,11 @@ class Model
   def classification smiles, train
     experimental = train.select{|row| row[0] == smiles}
     train = train-experimental
-    #train.select!{|row| row[2] > 0.8}
-    #train.select!{|row| row[2] > 0.5} if train.size < 2
-    #train.select!{|row| row[2] > 0.5}
-    #train.select!{|row| row[2] > -1.0} if train.size < 2
-    puts "=="
-    puts smiles
-    puts train.sort_by{|r| r[2]}[0..10].collect{|r| r.join(",")}.join("\n")
+    n = train.select{|row| row[2] > @minsim[0]} # first pass: neighbors above the strict similarity threshold
+    n = train.select{|row| row[2] > @minsim[1]} if n.size < 2 # relaxed fallback; non-destructive select, because select! returns nil when nothing is removed
+    train = n # always an Array (possibly empty), so the size check below cannot hit nil
+    #puts train.sort_by{|r| r[2]}[0..5].collect{|r| r.join(",")}.join("\n")
     #puts train.sort_by{|r| r[2]}.reverse.collect{|r| r.join(",")}.join("\n")
-    puts "--"
     if train.size < 2
       classification = nil
       probabilities = [nil,nil]
-- 
cgit v1.2.3