summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2021-03-08 21:25:45 +0100
committerChristoph Helma <helma@in-silico.ch>2021-03-08 21:25:45 +0100
commit1dcd741a5bff8dc41abf0840f59031eb557ff230 (patch)
treea76547c2cecbace4468a1a6ea392db880f95c6b0
parent08e5768e9a446db8ab95152d2e9403a0e635ec63 (diff)
neighbor selection adjusted, summary for high-confidence predictions
-rwxr-xr-xbin/classification-summary.rb34
-rwxr-xr-xbin/crossvalidation-folds.rb9
-rw-r--r--lib/model.rb26
3 files changed, 40 insertions, 29 deletions
diff --git a/bin/classification-summary.rb b/bin/classification-summary.rb
index c6755a1..45ffb29 100755
--- a/bin/classification-summary.rb
+++ b/bin/classification-summary.rb
@@ -3,6 +3,7 @@ require_relative "../lib/lazar"
#stat = ClassificationStatistics.new ARGV[0]
#stat.summary
dir = File.join(File.dirname(ARGV[0]),"crossvalidation")
+thresh = ARGV[1].to_f
folds = Dir[File.join(dir,"[0-9]*")]
predictions = []
@@ -10,7 +11,11 @@ tp=0
tn=0
fp=0
fn=0
-n=0
+hc_tp=0
+hc_tn=0
+hc_fp=0
+hc_fn=0
+#n=0
experimental = {}
lines = File.readlines(File.join(ARGV[0]))
@@ -28,39 +33,48 @@ File.open(File.join(dir,"predictions.csv"),"w+") do |f|
pred.each do |prediction|
smi = prediction[0]
exp = experimental[smi]
+ maxsim = prediction[5].to_f
+ v = "NA"
unless exp.nil? or prediction[2].empty? or exp.empty?
p = prediction[2].to_i
- n+=1
- v = "NA"
+ #n+=1
exp.each do |e|
if p and e
if p == 1 and e == 1
v = "TP"
tp+=1
+ hc_tp+=1 if maxsim > thresh
elsif p == 0 and e == 0
v = "TN"
tn+=1
+ hc_tn+=1 if maxsim > thresh
elsif p == 1 and e == 0
v = "FP"
fp+=1
+ hc_fp+=1 if maxsim > thresh
elsif p == 0 and e == 1
v = "FN"
fn+=1
+ hc_fn+=1 if maxsim > thresh
end
end
predictions << v
end
- f.puts([smi,v].join(","))
end
+ f.puts([smi,v,maxsim].join(","))
end
end
end
-File.open(File.join(dir,"confusion-matrix.csv"),"w+") do |f|
+File.open(File.join(dir,"confusion-matrix-all.csv"),"w+") do |f|
f.puts "#{tp},#{fp}\n#{fn},#{tn}"
end
-File.open(File.join(dir,"summary.csv"),"w+") do |f|
+File.open(File.join(dir,"confusion-matrix-high-confidence.csv"),"w+") do |f|
+ f.puts "#{hc_tp},#{hc_fp}\n#{hc_fn},#{hc_tn}"
+end
+
+File.open(File.join(dir,"summary-all.csv"),"w+") do |f|
f.puts "accuracy,#{(tp+tn)/(tp+fp+tn+fn).to_f}"
f.puts "true_positive_rate,#{tp/(tp+fn).to_f}"
f.puts "true_negative_rate,#{tn/(tn+fp).to_f}"
@@ -68,3 +82,11 @@ File.open(File.join(dir,"summary.csv"),"w+") do |f|
f.puts "negative_predictive_value,#{tn/(tn+fn).to_f}"
end
+File.open(File.join(dir,"summary-high-confidence.csv"),"w+") do |f|
+ f.puts "accuracy,#{(hc_tp+hc_tn)/(hc_tp+hc_fp+hc_tn+hc_fn).to_f}"
+ f.puts "true_positive_rate,#{hc_tp/(hc_tp+hc_fn).to_f}"
+ f.puts "true_negative_rate,#{hc_tn/(hc_tn+hc_fp).to_f}"
+ f.puts "positive_predictive_value,#{hc_tp/(hc_tp+hc_fp).to_f}"
+ f.puts "negative_predictive_value,#{hc_tn/(hc_tn+hc_fn).to_f}"
+end
+
diff --git a/bin/crossvalidation-folds.rb b/bin/crossvalidation-folds.rb
index 0c765f7..16a4103 100755
--- a/bin/crossvalidation-folds.rb
+++ b/bin/crossvalidation-folds.rb
@@ -17,6 +17,7 @@ start = 0
:train => indices-test_idxs,
:test => test_idxs
}
+ p idxs
start = last+1
# write training/test data
cv_dir = File.join(File.dirname(ARGV[0]),"crossvalidation",i.to_s)
@@ -39,14 +40,6 @@ start = 0
f.puts t.join(",")
end
end
- file = File.join(cv_dir,t.to_s+"-experimental.csv")
- File.open(file,"w+") do |f|
- f.puts (["Canonical SMILES", model.dependent_variable_name]).join(",")
- idx.collect{|i| model.train[i]}.each do |t|
- # TODO fix
- f.puts t[0..1].join(",")
- end
- end
end
end
Process.waitall
diff --git a/lib/model.rb b/lib/model.rb
index c4ca1f3..d62d889 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -20,7 +20,7 @@ class Model
puts "Determining model type."
if dependent_variables.uniq == ["1","0"]
@dependent_variable_type = "binary"
- @train.each {|t| t[1] == "1" ? t[1] = true : t[1] = false }
+ @train.each {|t| t[1] == t[1].to_i}
elsif dependent_variables.collect{|v| v.numeric?}.uniq == [true]
@dependent_variable_type = "numeric"
@train.each {|t| t[1] = t[1].to_f }
@@ -38,7 +38,6 @@ class Model
end
def predict file
-=begin
model_type
puts "Reading prediction data from #{file}."
@batch = File.readlines(file).collect{|l| l.chomp.split(",")}
@@ -48,6 +47,7 @@ class Model
raise "Incorrect batch independent variables [#{independent_variables.flatten.uniq}]. Must be #{@independent_variable_type}."
end
if @independent_variable_type == "numeric"
+ @minsim = [0.9,0.7]
@batch.each {|t| t[1..-1] = t[1..-1].collect{|v| v = v.to_f}}
select(@independent_variable_names & @batch_independent_variable_names)
File.open(File.join(File.dirname(file),"common-variables.csv"),"w+") do |f|
@@ -62,7 +62,6 @@ class Model
end
puts "Feature selection and scaling."
puts `Rscript #{File.join(File.dirname(__FILE__),"..","bin","preprocessing.R")} #{File.join(File.dirname(file),"common-variables.csv")} #{File.join(File.dirname(file),"scaled-variables.csv")}`
-=end
puts "Reading scaled features."
lines = File.readlines(File.join(File.dirname(file),"scaled-variables.csv"))
@independent_variable_names = @batch_independent_variable_names = lines.shift.chomp.split(",")[2..-1]
@@ -82,14 +81,15 @@ class Model
File.open(file.sub(".csv","-prediction.csv"),"w+") do |f|
f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].join(",")
@scaled_batch.each do |pred|
- classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Distance.euclid([row[2..-1],pred[1..-1]])]}).each do |pred|
- #classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred|
+ classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred|
f.puts pred.join(",")
- puts pred.join(",")
+ #puts pred.join(",")
end
end
end
- #end
+ elsif @independent_variable_type == "set"
+ @minsim = [0.5,0.2]
+ end
end
def select variable_names
@@ -104,15 +104,11 @@ class Model
def classification smiles, train
experimental = train.select{|row| row[0] == smiles}
train = train-experimental
- #train.select!{|row| row[2] > 0.8}
- #train.select!{|row| row[2] > 0.5} if train.size < 2
- #train.select!{|row| row[2] > 0.5}
- #train.select!{|row| row[2] > -1.0} if train.size < 2
- puts "=="
- puts smiles
- puts train.sort_by{|r| r[2]}[0..10].collect{|r| r.join(",")}.join("\n")
+ n = train.select{|row| row[2] > @minsim[0]}
+ n = train.select!{|row| row[2] > @minsim[1]} if n.size < 2
+ train = n
+ #puts train.sort_by{|r| r[2]}[0..5].collect{|r| r.join(",")}.join("\n")
#puts train.sort_by{|r| r[2]}.reverse.collect{|r| r.join(",")}.join("\n")
- puts "--"
if train.size < 2
classification = nil
probabilities = [nil,nil]