From 1dcd741a5bff8dc41abf0840f59031eb557ff230 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 8 Mar 2021 21:25:45 +0100 Subject: neighbor selection adjusted, summary for high-confidence predictions --- bin/classification-summary.rb | 34 ++++++++++++++++++++++++++++------ bin/crossvalidation-folds.rb | 9 +-------- lib/model.rb | 26 +++++++++++--------------- 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/bin/classification-summary.rb b/bin/classification-summary.rb index c6755a1..45ffb29 100755 --- a/bin/classification-summary.rb +++ b/bin/classification-summary.rb @@ -3,6 +3,7 @@ require_relative "../lib/lazar" #stat = ClassificationStatistics.new ARGV[0] #stat.summary dir = File.join(File.dirname(ARGV[0]),"crossvalidation") +thresh = ARGV[1].to_f folds = Dir[File.join(dir,"[0-9]*")] predictions = [] @@ -10,7 +11,11 @@ tp=0 tn=0 fp=0 fn=0 -n=0 +hc_tp=0 +hc_tn=0 +hc_fp=0 +hc_fn=0 +#n=0 experimental = {} lines = File.readlines(File.join(ARGV[0])) @@ -28,39 +33,48 @@ File.open(File.join(dir,"predictions.csv"),"w+") do |f| pred.each do |prediction| smi = prediction[0] exp = experimental[smi] + maxsim = prediction[5].to_f + v = "NA" unless exp.nil? or prediction[2].empty? or exp.empty? 
p = prediction[2].to_i - n+=1 - v = "NA" + #n+=1 exp.each do |e| if p and e if p == 1 and e == 1 v = "TP" tp+=1 + hc_tp+=1 if maxsim > thresh elsif p == 0 and e == 0 v = "TN" tn+=1 + hc_tn+=1 if maxsim > thresh elsif p == 1 and e == 0 v = "FP" fp+=1 + hc_fp+=1 if maxsim > thresh elsif p == 0 and e == 1 v = "FN" fn+=1 + hc_fn+=1 if maxsim > thresh end end predictions << v end - f.puts([smi,v].join(",")) end + f.puts([smi,v,maxsim].join(",")) end end end -File.open(File.join(dir,"confusion-matrix.csv"),"w+") do |f| +File.open(File.join(dir,"confusion-matrix-all.csv"),"w+") do |f| f.puts "#{tp},#{fp}\n#{fn},#{tn}" end -File.open(File.join(dir,"summary.csv"),"w+") do |f| +File.open(File.join(dir,"confusion-matrix-high-confidence.csv"),"w+") do |f| + f.puts "#{hc_tp},#{hc_fp}\n#{hc_fn},#{hc_tn}" +end + +File.open(File.join(dir,"summary-all.csv"),"w+") do |f| f.puts "accuracy,#{(tp+tn)/(tp+fp+tn+fn).to_f}" f.puts "true_positive_rate,#{tp/(tp+fn).to_f}" f.puts "true_negative_rate,#{tn/(tn+fp).to_f}" @@ -68,3 +82,11 @@ File.open(File.join(dir,"summary.csv"),"w+") do |f| f.puts "negative_predictive_value,#{tn/(tn+fn).to_f}" end +File.open(File.join(dir,"summary-high-confidence.csv"),"w+") do |f| + f.puts "accuracy,#{(hc_tp+hc_tn)/(hc_tp+hc_fp+hc_tn+hc_fn).to_f}" + f.puts "true_positive_rate,#{hc_tp/(hc_tp+hc_fn).to_f}" + f.puts "true_negative_rate,#{hc_tn/(hc_tn+hc_fp).to_f}" + f.puts "positive_predictive_value,#{hc_tp/(hc_tp+hc_fp).to_f}" + f.puts "negative_predictive_value,#{hc_tn/(hc_tn+hc_fn).to_f}" +end + diff --git a/bin/crossvalidation-folds.rb b/bin/crossvalidation-folds.rb index 0c765f7..16a4103 100755 --- a/bin/crossvalidation-folds.rb +++ b/bin/crossvalidation-folds.rb @@ -17,6 +17,7 @@ start = 0 :train => indices-test_idxs, :test => test_idxs } + p idxs start = last+1 # write training/test data cv_dir = File.join(File.dirname(ARGV[0]),"crossvalidation",i.to_s) @@ -39,14 +40,6 @@ start = 0 f.puts t.join(",") end end - file = 
File.join(cv_dir,t.to_s+"-experimental.csv") - File.open(file,"w+") do |f| - f.puts (["Canonical SMILES", model.dependent_variable_name]).join(",") - idx.collect{|i| model.train[i]}.each do |t| - # TODO fix - f.puts t[0..1].join(",") - end - end end end Process.waitall diff --git a/lib/model.rb b/lib/model.rb index c4ca1f3..d62d889 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -20,7 +20,7 @@ class Model puts "Determining model type." if dependent_variables.uniq == ["1","0"] @dependent_variable_type = "binary" - @train.each {|t| t[1] == "1" ? t[1] = true : t[1] = false } + @train.each {|t| t[1] == t[1].to_i} elsif dependent_variables.collect{|v| v.numeric?}.uniq == [true] @dependent_variable_type = "numeric" @train.each {|t| t[1] = t[1].to_f } @@ -38,7 +38,6 @@ class Model end def predict file -=begin model_type puts "Reading prediction data from #{file}." @batch = File.readlines(file).collect{|l| l.chomp.split(",")} @@ -48,6 +47,7 @@ class Model raise "Incorrect batch independent variables [#{independent_variables.flatten.uniq}]. Must be #{@independent_variable_type}." end if @independent_variable_type == "numeric" + @minsim = [0.9,0.7] @batch.each {|t| t[1..-1] = t[1..-1].collect{|v| v = v.to_f}} select(@independent_variable_names & @batch_independent_variable_names) File.open(File.join(File.dirname(file),"common-variables.csv"),"w+") do |f| @@ -62,7 +62,6 @@ class Model end puts "Feature selection and scaling." puts `Rscript #{File.join(File.dirname(__FILE__),"..","bin","preprocessing.R")} #{File.join(File.dirname(file),"common-variables.csv")} #{File.join(File.dirname(file),"scaled-variables.csv")}` -=end puts "Reading scaled features." 
lines = File.readlines(File.join(File.dirname(file),"scaled-variables.csv")) @independent_variable_names = @batch_independent_variable_names = lines.shift.chomp.split(",")[2..-1] @@ -82,14 +81,15 @@ class Model File.open(file.sub(".csv","-prediction.csv"),"w+") do |f| f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].join(",") @scaled_batch.each do |pred| - classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Distance.euclid([row[2..-1],pred[1..-1]])]}).each do |pred| - #classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred| + classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred| f.puts pred.join(",") - puts pred.join(",") + #puts pred.join(",") end end end - #end + elsif @independent_variable_type == "set" + @minsim = [0.5,0.2] + end end def select variable_names @@ -104,15 +104,11 @@ class Model def classification smiles, train experimental = train.select{|row| row[0] == smiles} train = train-experimental - #train.select!{|row| row[2] > 0.8} - #train.select!{|row| row[2] > 0.5} if train.size < 2 - #train.select!{|row| row[2] > 0.5} - #train.select!{|row| row[2] > -1.0} if train.size < 2 - puts "==" - puts smiles - puts train.sort_by{|r| r[2]}[0..10].collect{|r| r.join(",")}.join("\n") + n = train.select{|row| row[2] > @minsim[0]} + n = train.select!{|row| row[2] > @minsim[1]} if n.size < 2 + train = n + #puts train.sort_by{|r| r[2]}[0..5].collect{|r| r.join(",")}.join("\n") #puts train.sort_by{|r| r[2]}.reverse.collect{|r| r.join(",")}.join("\n") - puts "--" if train.size < 2 classification = nil probabilities = [nil,nil] -- cgit v1.2.3