From 83591831c6e36c36d87159acba6afdfedab95522 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 18 Mar 2021 16:48:36 +0100 Subject: fingerprint predictions added --- bin/batch-prediction.rb | 4 - bin/batch_padel_classification.rb | 20 ---- bin/classification-summary.rb | 92 ---------------- bin/confusion_matrix.rb | 4 - bin/crossvalidation-folds.rb | 47 -------- bin/crossvalidation-predictions.rb | 13 --- bin/crossvalidation.rb | 15 --- bin/export-fingerprints.rb | 23 ---- bin/fingerprint_independent_variables.rb | 4 - bin/lazar | 182 +++++++++++++++++++++++++++++++ bin/scale_independent_variables.rb | 4 - 11 files changed, 182 insertions(+), 226 deletions(-) delete mode 100755 bin/batch-prediction.rb delete mode 100755 bin/batch_padel_classification.rb delete mode 100755 bin/classification-summary.rb delete mode 100755 bin/confusion_matrix.rb delete mode 100755 bin/crossvalidation-folds.rb delete mode 100755 bin/crossvalidation-predictions.rb delete mode 100755 bin/crossvalidation.rb delete mode 100755 bin/export-fingerprints.rb delete mode 100755 bin/fingerprint_independent_variables.rb create mode 100755 bin/lazar delete mode 100755 bin/scale_independent_variables.rb (limited to 'bin') diff --git a/bin/batch-prediction.rb b/bin/batch-prediction.rb deleted file mode 100755 index 770bc60..0000000 --- a/bin/batch-prediction.rb +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../lib/lazar" -model = Model.new ARGV[0] -model.predict ARGV[1] diff --git a/bin/batch_padel_classification.rb b/bin/batch_padel_classification.rb deleted file mode 100755 index 6d05907..0000000 --- a/bin/batch_padel_classification.rb +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../lib/lazar" -model = ClassificationModel.new ARGV[0] -model.predict_file ARGV[1] -=begin -File.read(ARGV[1]).each_line do |line| - if line.match(/Name/i) - file_descriptors = line.chomp.split(";") - model_descriptors = File.read(File.join(ARGV[0],"independent_variable_names")).chomp.split(",").collect{|d| d.gsub('"','')} - common_descriptors = model_descriptors & file_descriptors - puts "ID,SMILES,experimental,classification,probability(0),probability(1),max_similarity,nr_neighbors" - else - descriptor_values = [] - line.chomp.split(",") - descriptor_values.shift - puts ([id] + model.predict_smiles(smi)).join(",") - end -end - -=end diff --git a/bin/classification-summary.rb b/bin/classification-summary.rb deleted file mode 100755 index 45ffb29..0000000 --- a/bin/classification-summary.rb +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../lib/lazar" -#stat = ClassificationStatistics.new ARGV[0] -#stat.summary -dir = File.join(File.dirname(ARGV[0]),"crossvalidation") -thresh = ARGV[1].to_f -folds = Dir[File.join(dir,"[0-9]*")] - -predictions = [] -tp=0 -tn=0 -fp=0 -fn=0 -hc_tp=0 -hc_tn=0 -hc_fp=0 -hc_fn=0 -#n=0 -experimental = {} - -lines = File.readlines(File.join(ARGV[0])) -lines.shift -lines.each do |line| - items = line.chomp.split(',') - experimental[items[0]] ||= [] - experimental[items[0]] << items[1].to_i -end - -File.open(File.join(dir,"predictions.csv"),"w+") do |f| - folds.each do |fold| - pred = File.readlines(File.join(fold,"test-prediction.csv")).collect{|row| row.chomp.split(",")} - pred.shift - pred.each do |prediction| - smi = prediction[0] - exp = experimental[smi] - maxsim = prediction[5].to_f - v = "NA" - unless exp.nil? or prediction[2].empty? or exp.empty? - p = prediction[2].to_i - #n+=1 - exp.each do |e| - if p and e - if p == 1 and e == 1 - v = "TP" - tp+=1 - hc_tp+=1 if maxsim > thresh - elsif p == 0 and e == 0 - v = "TN" - tn+=1 - hc_tn+=1 if maxsim > thresh - elsif p == 1 and e == 0 - v = "FP" - fp+=1 - hc_fp+=1 if maxsim > thresh - elsif p == 0 and e == 1 - v = "FN" - fn+=1 - hc_fn+=1 if maxsim > thresh - end - end - predictions << v - end - end - f.puts([smi,v,maxsim].join(",")) - end - end -end - -File.open(File.join(dir,"confusion-matrix-all.csv"),"w+") do |f| - f.puts "#{tp},#{fp}\n#{fn},#{tn}" -end - -File.open(File.join(dir,"confusion-matrix-high-confidence.csv"),"w+") do |f| - f.puts "#{hc_tp},#{hc_fp}\n#{hc_fn},#{hc_tn}" -end - -File.open(File.join(dir,"summary-all.csv"),"w+") do |f| - f.puts "accuracy,#{(tp+tn)/(tp+fp+tn+fn).to_f}" - f.puts "true_positive_rate,#{tp/(tp+fn).to_f}" - f.puts "true_negative_rate,#{tn/(tn+fp).to_f}" - f.puts "positive_predictive_value,#{tp/(tp+fp).to_f}" - f.puts "negative_predictive_value,#{tn/(tn+fn).to_f}" -end - -File.open(File.join(dir,"summary-high-confidence.csv"),"w+") do |f| - f.puts "accuracy,#{(hc_tp+hc_tn)/(hc_tp+hc_fp+hc_tn+hc_fn).to_f}" - f.puts "true_positive_rate,#{hc_tp/(hc_tp+hc_fn).to_f}" - f.puts "true_negative_rate,#{hc_tn/(hc_tn+hc_fp).to_f}" - f.puts "positive_predictive_value,#{hc_tp/(hc_tp+hc_fp).to_f}" - f.puts "negative_predictive_value,#{hc_tn/(hc_tn+hc_fn).to_f}" -end - diff --git a/bin/confusion_matrix.rb b/bin/confusion_matrix.rb deleted file mode 100755 index 789262d..0000000 --- a/bin/confusion_matrix.rb +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../lib/lazar" -stat = ClassificationStatistics.new ARGV[0] -stat.confusion_matrix diff --git a/bin/crossvalidation-folds.rb b/bin/crossvalidation-folds.rb deleted file mode 100755 index 16a4103..0000000 --- a/bin/crossvalidation-folds.rb +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../lib/lazar" -model = Model.new ARGV[0] -ARGV[1] ? folds = ARGV[1].to_i : folds = 10 -nr_instances = model.train.size -indices = (0..nr_instances-1).to_a.shuffle -mid = (nr_instances/folds) -start = 0 -0.upto(folds-1) do |i| - fork do - # split train data - puts "Creating fold #{i}" - last = start+mid - last = last-1 unless nr_instances%folds > i - test_idxs = indices[start..last] || [] - idxs = { - :train => indices-test_idxs, - :test => test_idxs - } - p idxs - start = last+1 - # write training/test data - cv_dir = File.join(File.dirname(ARGV[0]),"crossvalidation",i.to_s) - idxs.each do |t,idx| - file = File.join(cv_dir,t.to_s+".csv") - `mkdir -p #{File.dirname file}` - case t - when :train - File.open(file,"w+") do |f| - f.puts (["Canonical SMILES",model.dependent_variable_name] + model.independent_variable_names).join(",") - idx.collect{|i| model.train[i]}.each do |t| - f.puts t.join(",") - end - end - when :test - File.open(file,"w+") do |f| - f.puts (["Canonical SMILES"] + model.independent_variable_names).join(",") - idx.collect{|i| model.train[i]}.each do |t| - t.delete_at(1) - f.puts t.join(",") - end - end - end - end - Process.waitall - end -end diff --git a/bin/crossvalidation-predictions.rb b/bin/crossvalidation-predictions.rb deleted file mode 100755 index 55ae5a1..0000000 --- a/bin/crossvalidation-predictions.rb +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../lib/lazar" - -t = Time.now -Dir["#{File.join(ARGV[0],'[0-9]')}"].each do |fold| - fork do - puts "Crossvalidation #{fold} started" - model = Model.new File.join(fold,"train.csv") - model.predict File.join(fold,"test.csv") - end -end -Process.waitall -puts "Crossvalidation: #{(Time.now-t)/60} min" diff --git a/bin/crossvalidation.rb b/bin/crossvalidation.rb deleted file mode 100755 index b7cfdd7..0000000 --- a/bin/crossvalidation.rb +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../lib/lazar" -dir = ARGV[0] -dependent_variable_type = File.read(File.join(dir, "dependent-variable-type")).chomp -independent_variable_type = File.read(File.join(dir, "independent-variable-type")).chomp -if dependent_variable_type == "binary" and independent_variable_type == "binary" - model = TanimotoClassificationModel.new dir -elsif dependent_variable_type == "binary" and independent_variable_type == "numeric" - model = CosineClassificationModel.new dir -elsif dependent_variable_type == "numeric" and independent_variable_type == "binary" - model = TanimotoRegressionModel.new dir -elsif dependent_variable_type == "numeric" and independent_variable_type == "numeric" - model = CosineRegressionModel.new dir -end -model.crossvalidation diff --git a/bin/export-fingerprints.rb b/bin/export-fingerprints.rb deleted file mode 100755 index 0e1e934..0000000 --- a/bin/export-fingerprints.rb +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env ruby - -dir = ARGV[0] -fp = [] -endpoint_name = File.read(File.join dir,"dependent_variable_name").chomp -endpoints = File.open(File.join dir,"dependent_variables").readlines.collect{|v| v.to_i} -endpoint_values = File.open(File.join dir,"dependent_variable_values").readlines -smiles = [] - -File.open(File.join dir,"independent_variables").each_line do |l| - l = l.chomp.split(",") - smiles << l.shift - fp << l -end - -fp_names = fp.flatten.sort.uniq -header = ["Canonical SMILES"]+fp_names+[endpoint_name] -puts header.join(",") - -(0..smiles.size-1).each do |i| - line = [smiles[i]]+fp_names.collect{|n| fp[i].include?(n) ? 1 : 0}+[endpoint_values[endpoints[i]]] - puts line.join(",") -end diff --git a/bin/fingerprint_independent_variables.rb b/bin/fingerprint_independent_variables.rb deleted file mode 100755 index 7dea239..0000000 --- a/bin/fingerprint_independent_variables.rb +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../lib/lazar" -dataset = Dataset.new ARGV[0] -dataset.fingerprint_independent_variables ARGV[0] diff --git a/bin/lazar b/bin/lazar new file mode 100755 index 0000000..e603b4c --- /dev/null +++ b/bin/lazar @@ -0,0 +1,182 @@ +#!/usr/bin/env ruby +require 'optparse' +require_relative '../lib/lazar' + +ARGV << '-h' if ARGV.empty? +options = {} +options[:folds] = 10 +options[:thresholds] = [0.5,0.2] + +OptionParser.new do |opts| + opts.banner = "Usage: lazar -t TRAIN -x|-p descriptors [options]" + opts.on( '-h', '--help', 'Display this screen' ) do + puts opts + exit + end + opts.on( '-t TRAIN', '-train TRAIN', "Training data in csv format (required). Type 'lazar -f' for format specifications." ) do |t| + options[:train] = t + end + opts.on( '-p descriptors', '--predict descriptors', "Prediction data in csv format. Type 'lazar -f' for format specifications.") do |p| + options[:predict] = p + end + opts.on( '-x', '--crossvalidation', "Run crossvalidation." ) do |c| + options[:cv] = true + end + opts.on( '-f folds', '--folds folds', Integer, "Change crossvalidation folds (default: #{options[:folds]})." ) do |f| + options[:folds] = f + end + opts.on( '-f', '--formats', "Describe input and output formats" ) do |f| + raise OptionParser::InvalidArgument, "Format description not yet implemented." + end +# opts.on( '-d', '--daemon', "Run as daemon in background" ) do |f| +# raise OptionParser::InvalidArgument, "Daemon mode not yet implemented" +# end +end.parse! + +raise OptionParser::MissingArgument, "Training data is required. Type 'lazar -h' for help." if options[:train].nil? +raise OptionParser::InvalidArgument, "Training data file #{options[:train]} does not exist. Type 'lazar -h' for help." unless File.exists? options[:train] +raise OptionParser::InvalidOption, "Choose either --predict or --crossvalidation. Type 'lazar -h' for help." if options[:predict] and options[:cv] +raise OptionParser::InvalidOption, "One of the --predict or --crossvalidation options is required. Type 'lazar -h' for help." unless options[:predict] or options[:cv] +raise OptionParser::InvalidArgument, "Prediction descriptor file #{options[:predict]} does not exist. Type 'lazar -h' for help." if options[:predict] and !File.exists? options[:predict] + +model = Model.new options[:train] + +if options[:predict] # batch predictions + model.predict options[:predict] + +elsif options[:cv] # crossvalidation + + # create folds + cv_dir = File.join(File.dirname(options[:train]),"crossvalidation") + folds = (0..options[:folds]-1).collect{|i| File.join(cv_dir,i.to_s)} + nr_instances = model.train.size + indices = (0..nr_instances-1).to_a.shuffle + mid = (nr_instances/options[:folds]) + start = 0 + 0.upto(options[:folds]-1) do |i| + + # split train data + puts "Creating fold #{i}" + last = start+mid + last = last-1 unless nr_instances%options[:folds] > i + test_idxs = indices[start..last] || [] + idxs = { + :train => indices-test_idxs, + :test => test_idxs + } + start = last+1 + + # write training/test data + idxs.each do |t,idx| + file = File.join(cv_dir,i.to_s,t.to_s+".csv") + `mkdir -p #{File.dirname file}` + case t + when :train + File.open(file,"w+") do |f| + f.puts (["Canonical SMILES",model.dependent_variable_name] + model.independent_variable_names).join(",") + idx.collect{|i| model.train[i]}.each do |t| + f.puts t.join(",") + end + end + when :test + File.open(file,"w+") do |f| + f.puts (["Canonical SMILES"] + model.independent_variable_names).join(",") + idx.collect{|i| model.train[i]}.each do |t| + o = t.clone # keep model.train intact + o.delete_at(1) + f.puts o.join(",") + end + end + end + end + end + + # crossvalidation predictions + t = Time.now + folds.each do |fold| + fork do + puts "Crossvalidation #{fold} started" + m = Model.new File.join(fold,"train.csv") + m.predict File.join(fold,"test.csv") + end + end + Process.waitall + puts "Crossvalidation: #{(Time.now-t)/60} min" + + # crossvalidation summaries + + predictions = [] + tp=0 + tn=0 + fp=0 + fn=0 + hc_tp=0 + hc_tn=0 + hc_fp=0 + hc_fn=0 + + File.open(File.join(cv_dir,"predictions.csv"),"w+") do |f| + folds.each do |fold| + pred = File.readlines(File.join(fold,"test-prediction.csv")).collect{|row| row.chomp.split(",")} + pred.shift + pred.each do |prediction| + smi = prediction[0] + exp = model.train.select{|t| t[0] == smi}.collect{|t| t[1].to_i} + maxsim = prediction[5].to_f + v = "NA" + unless exp.nil? or prediction[2].empty? or exp.empty? + p = prediction[2].to_i + exp.each do |e| + if p and e + if p == 1 and e == 1 + v = "TP" + tp+=1 + hc_tp+=1 if maxsim > model.minsim.max + elsif p == 0 and e == 0 + v = "TN" + tn+=1 + hc_tn+=1 if maxsim > model.minsim.max + elsif p == 1 and e == 0 + v = "FP" + fp+=1 + hc_fp+=1 if maxsim > model.minsim.max + elsif p == 0 and e == 1 + v = "FN" + fn+=1 + hc_fn+=1 if maxsim > model.minsim.max + end + end + predictions << v + end + end + f.puts([smi,v,maxsim].join(",")) + end + end + end + + File.open(File.join(cv_dir,"confusion-matrix-all.csv"),"w+") do |f| + f.puts "#{tp},#{fp}\n#{fn},#{tn}" + end + + File.open(File.join(cv_dir,"confusion-matrix-high-confidence.csv"),"w+") do |f| + f.puts "#{hc_tp},#{hc_fp}\n#{hc_fn},#{hc_tn}" + end + + File.open(File.join(cv_dir,"summary-all.csv"),"w+") do |f| + f.puts "accuracy,#{(tp+tn)/(tp+fp+tn+fn).to_f}" + f.puts "true_positive_rate,#{tp/(tp+fn).to_f}" + f.puts "true_negative_rate,#{tn/(tn+fp).to_f}" + f.puts "positive_predictive_value,#{tp/(tp+fp).to_f}" + f.puts "negative_predictive_value,#{tn/(tn+fn).to_f}" + end + + File.open(File.join(cv_dir,"summary-high-confidence.csv"),"w+") do |f| + f.puts "accuracy,#{(hc_tp+hc_tn)/(hc_tp+hc_fp+hc_tn+hc_fn).to_f}" + f.puts "true_positive_rate,#{hc_tp/(hc_tp+hc_fn).to_f}" + f.puts "true_negative_rate,#{hc_tn/(hc_tn+hc_fp).to_f}" + f.puts "positive_predictive_value,#{hc_tp/(hc_tp+hc_fp).to_f}" + f.puts "negative_predictive_value,#{hc_tn/(hc_tn+hc_fn).to_f}" + end + +end + diff --git a/bin/scale_independent_variables.rb b/bin/scale_independent_variables.rb deleted file mode 100755 index 1d7662a..0000000 --- a/bin/scale_independent_variables.rb +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env ruby -require_relative "../lib/lazar" -dataset = Dataset.new ARGV[0] -dataset.scale_independent_variables ARGV[0] -- cgit v1.2.3