11 files changed, 182 insertions, 226 deletions
diff --git a/bin/batch-prediction.rb b/bin/batch-prediction.rb
deleted file mode 100755
index 770bc60..0000000
--- a/bin/batch-prediction.rb
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../lib/lazar"
-model = Model.new ARGV[0]
-model.predict ARGV[1]
diff --git a/bin/batch_padel_classification.rb b/bin/batch_padel_classification.rb
deleted file mode 100755
index 6d05907..0000000
--- a/bin/batch_padel_classification.rb
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../lib/lazar"
-model = ClassificationModel.new ARGV[0]
-model.predict_file ARGV[1]
-=begin
-File.read(ARGV[1]).each_line do |line|
-  if line.match(/Name/i)
-    file_descriptors = line.chomp.split(";")
-    model_descriptors = File.read(File.join(ARGV[0],"independent_variable_names")).chomp.split(",").collect{|d| d.gsub('"','')}
-    common_descriptors = model_descriptors & file_descriptors
-    puts "ID,SMILES,experimental,classification,probability(0),probability(1),max_similarity,nr_neighbors"
-  else
-    descriptor_values = []
-      line.chomp.split(",")
-    descriptor_values.shift
-    puts ([id] + model.predict_smiles(smi)).join(",")
-  end
-end
-
-=end
diff --git a/bin/classification-summary.rb b/bin/classification-summary.rb
deleted file mode 100755
index 45ffb29..0000000
--- a/bin/classification-summary.rb
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../lib/lazar"
-#stat = ClassificationStatistics.new ARGV[0]
-#stat.summary
-dir = File.join(File.dirname(ARGV[0]),"crossvalidation")
-thresh = ARGV[1].to_f
-folds = Dir[File.join(dir,"[0-9]*")]
-
-predictions = []
-tp=0
-tn=0
-fp=0
-fn=0
-hc_tp=0
-hc_tn=0
-hc_fp=0
-hc_fn=0
-#n=0
-experimental = {}
-
-lines = File.readlines(File.join(ARGV[0])) 
-lines.shift
-lines.each do |line|
-  items = line.chomp.split(',')
-  experimental[items[0]] ||= []
-  experimental[items[0]] << items[1].to_i
-end
-
-File.open(File.join(dir,"predictions.csv"),"w+") do |f|
-  folds.each do |fold|
-    pred = File.readlines(File.join(fold,"test-prediction.csv")).collect{|row| row.chomp.split(",")}
-    pred.shift
-    pred.each do |prediction|
-      smi = prediction[0]
-      exp = experimental[smi]
-      maxsim = prediction[5].to_f
-      v = "NA"
-      unless exp.nil? or prediction[2].empty? or exp.empty?
-        p = prediction[2].to_i
-        #n+=1
-        exp.each do |e|
-          if p and e
-            if p == 1 and e == 1
-              v = "TP"
-              tp+=1
-              hc_tp+=1 if maxsim > thresh
-            elsif p == 0 and e == 0
-              v = "TN"
-              tn+=1
-              hc_tn+=1 if maxsim > thresh
-            elsif p == 1 and e == 0
-              v = "FP"
-              fp+=1
-              hc_fp+=1 if maxsim > thresh
-            elsif p == 0 and e == 1
-              v = "FN"
-              fn+=1
-              hc_fn+=1 if maxsim > thresh
-            end
-          end
-          predictions << v
-        end
-      end
-      f.puts([smi,v,maxsim].join(","))
-    end
-  end
-end
-
-File.open(File.join(dir,"confusion-matrix-all.csv"),"w+") do |f|
-  f.puts "#{tp},#{fp}\n#{fn},#{tn}"
-end
-
-File.open(File.join(dir,"confusion-matrix-high-confidence.csv"),"w+") do |f|
-  f.puts "#{hc_tp},#{hc_fp}\n#{hc_fn},#{hc_tn}"
-end
-
-File.open(File.join(dir,"summary-all.csv"),"w+") do |f|
-  f.puts "accuracy,#{(tp+tn)/(tp+fp+tn+fn).to_f}"
-  f.puts "true_positive_rate,#{tp/(tp+fn).to_f}"
-  f.puts "true_negative_rate,#{tn/(tn+fp).to_f}"
-  f.puts "positive_predictive_value,#{tp/(tp+fp).to_f}"
-  f.puts "negative_predictive_value,#{tn/(tn+fn).to_f}"
-end
-
-File.open(File.join(dir,"summary-high-confidence.csv"),"w+") do |f|
-  f.puts "accuracy,#{(hc_tp+hc_tn)/(hc_tp+hc_fp+hc_tn+hc_fn).to_f}"
-  f.puts "true_positive_rate,#{hc_tp/(hc_tp+hc_fn).to_f}"
-  f.puts "true_negative_rate,#{hc_tn/(hc_tn+hc_fp).to_f}"
-  f.puts "positive_predictive_value,#{hc_tp/(hc_tp+hc_fp).to_f}"
-  f.puts "negative_predictive_value,#{hc_tn/(hc_tn+hc_fn).to_f}"
-end
-
diff --git a/bin/confusion_matrix.rb b/bin/confusion_matrix.rb
deleted file mode 100755
index 789262d..0000000
--- a/bin/confusion_matrix.rb
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../lib/lazar"
-stat = ClassificationStatistics.new ARGV[0]
-stat.confusion_matrix
diff --git a/bin/crossvalidation-folds.rb b/bin/crossvalidation-folds.rb
deleted file mode 100755
index 16a4103..0000000
--- a/bin/crossvalidation-folds.rb
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../lib/lazar"
-model = Model.new ARGV[0]
-ARGV[1] ? folds = ARGV[1].to_i : folds = 10
-nr_instances = model.train.size
-indices = (0..nr_instances-1).to_a.shuffle
-mid = (nr_instances/folds)
-start = 0
-0.upto(folds-1) do |i|
-  fork do
-    # split train data
-    puts "Creating fold #{i}"
-    last = start+mid
-    last = last-1 unless nr_instances%folds > i
-    test_idxs = indices[start..last] || []
-    idxs = {
-      :train => indices-test_idxs,
-      :test => test_idxs
-    }
-    p idxs
-    start = last+1
-    # write training/test data
-    cv_dir = File.join(File.dirname(ARGV[0]),"crossvalidation",i.to_s)
-    idxs.each do |t,idx|
-      file = File.join(cv_dir,t.to_s+".csv")
-      `mkdir -p #{File.dirname file}`
-      case t
-      when :train
-        File.open(file,"w+") do |f|
-          f.puts (["Canonical SMILES",model.dependent_variable_name] + model.independent_variable_names).join(",")
-          idx.collect{|i| model.train[i]}.each do |t|
-            f.puts t.join(",")
-          end
-        end
-      when :test
-        File.open(file,"w+") do |f|
-          f.puts (["Canonical SMILES"] + model.independent_variable_names).join(",")
-          idx.collect{|i| model.train[i]}.each do |t|
-            t.delete_at(1)
-            f.puts t.join(",")
-          end
-        end
-      end
-    end
-    Process.waitall
-  end
-end
diff --git a/bin/crossvalidation-predictions.rb b/bin/crossvalidation-predictions.rb
deleted file mode 100755
index 55ae5a1..0000000
--- a/bin/crossvalidation-predictions.rb
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../lib/lazar"
-
-t = Time.now
-Dir["#{File.join(ARGV[0],'[0-9]')}"].each do |fold|
-  fork do
-    puts "Crossvalidation #{fold} started"
-    model = Model.new File.join(fold,"train.csv")
-    model.predict File.join(fold,"test.csv")
-  end
-end
-Process.waitall
-puts "Crossvalidation: #{(Time.now-t)/60} min"
diff --git a/bin/crossvalidation.rb b/bin/crossvalidation.rb
deleted file mode 100755
index b7cfdd7..0000000
--- a/bin/crossvalidation.rb
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../lib/lazar"
-dir = ARGV[0]
-dependent_variable_type = File.read(File.join(dir, "dependent-variable-type")).chomp
-independent_variable_type = File.read(File.join(dir, "independent-variable-type")).chomp
-if dependent_variable_type == "binary" and independent_variable_type == "binary"
-  model = TanimotoClassificationModel.new dir
-elsif dependent_variable_type == "binary" and independent_variable_type == "numeric"
-  model = CosineClassificationModel.new dir
-elsif dependent_variable_type == "numeric" and independent_variable_type == "binary"
-  model = TanimotoRegressionModel.new dir
-elsif dependent_variable_type == "numeric" and independent_variable_type == "numeric"
-  model = CosineRegressionModel.new dir
-end
-model.crossvalidation
diff --git a/bin/export-fingerprints.rb b/bin/export-fingerprints.rb
deleted file mode 100755
index 0e1e934..0000000
--- a/bin/export-fingerprints.rb
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env ruby
-
-dir = ARGV[0]
-fp = []
-endpoint_name = File.read(File.join dir,"dependent_variable_name").chomp
-endpoints = File.open(File.join dir,"dependent_variables").readlines.collect{|v| v.to_i}
-endpoint_values = File.open(File.join dir,"dependent_variable_values").readlines
-smiles = []
-
-File.open(File.join dir,"independent_variables").each_line do |l|
-  l = l.chomp.split(",")
-  smiles << l.shift
-  fp << l
-end
-
-fp_names = fp.flatten.sort.uniq
-header = ["Canonical SMILES"]+fp_names+[endpoint_name]
-puts header.join(",")
-
-(0..smiles.size-1).each do |i|
-  line = [smiles[i]]+fp_names.collect{|n| fp[i].include?(n) ? 1 : 0}+[endpoint_values[endpoints[i]]]
-  puts line.join(",")
-end
diff --git a/bin/fingerprint_independent_variables.rb b/bin/fingerprint_independent_variables.rb
deleted file mode 100755
index 7dea239..0000000
--- a/bin/fingerprint_independent_variables.rb
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../lib/lazar"
-dataset = Dataset.new ARGV[0]
-dataset.fingerprint_independent_variables ARGV[0]
diff --git a/bin/lazar b/bin/lazar
new file mode 100755
index 0000000..e603b4c
--- /dev/null
+++ b/bin/lazar
@@ -0,0 +1,182 @@
+#!/usr/bin/env ruby
+require 'optparse'
+require_relative '../lib/lazar'
+
+ARGV << '-h' if ARGV.empty? # no arguments at all: behave as if help was requested
+options = {}
+options[:folds] = 10 # default number of crossvalidation folds
+options[:thresholds] = [0.5,0.2] # NOTE(review): never read anywhere in this script
+
+OptionParser.new do |opts| # fills the options hash from ARGV; parse! removes the switches it recognizes
+  opts.banner = "Usage: lazar -t TRAIN -x|-p descriptors [options]"
+  opts.on( '-h', '--help', 'Display this screen' ) do
+    puts opts
+    exit
+  end
+  opts.on( '-t TRAIN', '--train TRAIN', "Training data in csv format (required). Type 'lazar -f' for format specifications." ) do |t|
+    options[:train] = t # long switches need two dashes, otherwise OptionParser registers a short switch
+  end
+  opts.on( '-p descriptors', '--predict descriptors', "Prediction data in csv format. Type 'lazar -f' for format specifications.") do |p|
+    options[:predict] = p
+  end
+  opts.on( '-x', '--crossvalidation', "Run crossvalidation." ) do |c|
+    options[:cv] = true
+  end
+  opts.on( '--folds folds', Integer, "Change crossvalidation folds (default: #{options[:folds]})." ) do |f|
+    options[:folds] = f # long form only: -f is taken by --formats, which the help texts above refer to
+  end
+  opts.on( '-f', '--formats', "Describe input and output formats" ) do |f|
+    raise OptionParser::InvalidArgument, "Format description not yet implemented."
+  end
+#  opts.on( '-d', '--daemon', "Run as daemon in background" ) do |f|
+#    raise OptionParser::InvalidArgument, "Daemon mode not yet implemented"
+#  end
+end.parse!
+
+raise OptionParser::MissingArgument, "Training data is required. Type 'lazar -h' for help." if options[:train].nil? # fail fast, before any model is built
+raise OptionParser::InvalidArgument, "Training data file #{options[:train]} does not exist. Type 'lazar -h' for help." unless File.exist?(options[:train]) # File.exists? was removed in Ruby 3.2
+raise OptionParser::InvalidOption, "Choose either --predict or --crossvalidation. Type 'lazar -h' for help." if options[:predict] && options[:cv]
+raise OptionParser::InvalidOption, "One of the --predict or --crossvalidation options is required. Type 'lazar -h' for help." unless options[:predict] || options[:cv]
+raise OptionParser::InvalidArgument, "Prediction descriptor file #{options[:predict]} does not exist. Type 'lazar -h' for help." if options[:predict] && !File.exist?(options[:predict])
+
+model = Model.new options[:train]
+
+if options[:predict] # batch predictions
+  model.predict options[:predict]
+
+elsif options[:cv] # crossvalidation
+
+  # create folds
+  cv_dir = File.join(File.dirname(options[:train]),"crossvalidation") # fold data is written next to the training file
+  folds = (0..options[:folds]-1).collect{|i| File.join(cv_dir,i.to_s)} # one directory path per fold: cv_dir/0, cv_dir/1, ...
+  nr_instances = model.train.size
+  indices = (0..nr_instances-1).to_a.shuffle # random permutation of the training row indices
+  mid = (nr_instances/options[:folds]) # base test-set size per fold (integer division)
+  start = 0 # position in indices where the current fold's test slice begins
+  0.upto(options[:folds]-1) do |i|
+
+    # split train data
+    puts "Creating fold #{i}"
+    last = start+mid # inclusive end of a slice holding mid+1 rows
+    last = last-1 unless nr_instances%options[:folds] > i # only the first (nr_instances % folds) folds keep the extra row
+    test_idxs = indices[start..last] || [] # guard: Array#[] returns nil for a range that starts beyond the array
+    idxs = {
+      :train => indices-test_idxs, # every row index that is not in this fold's test slice
+      :test => test_idxs
+    }
+    start = last+1 # the next fold's slice starts right after this one
+
+    # write training/test data
+    idxs.each do |set,rows| # set is :train or :test, rows are indices into model.train
+      file = File.join(cv_dir,i.to_s,set.to_s+".csv")
+      system("mkdir","-p",File.dirname(file)) # argv form runs no shell, so paths with spaces or metacharacters are safe
+      case set
+      when :train
+        File.open(file,"w+") do |f|
+          f.puts((["Canonical SMILES",model.dependent_variable_name] + model.independent_variable_names).join(","))
+          rows.each do |row|
+            f.puts model.train[row].join(",")
+          end
+        end
+      when :test
+        File.open(file,"w+") do |f|
+          f.puts((["Canonical SMILES"] + model.independent_variable_names).join(","))
+          rows.each do |row|
+            o = model.train[row].clone # keep model.train intact
+            o.delete_at(1) # drop the dependent variable (second column of the train header) from test data
+            f.puts o.join(",")
+          end
+        end
+      end
+    end
+  end
+
+  # crossvalidation predictions
+  t = Time.now # wall-clock start, used only for the duration message below
+  folds.each do |fold|
+    fork do # one child process per fold, so folds are predicted concurrently
+      puts "Crossvalidation #{fold} started"
+      m = Model.new File.join(fold,"train.csv")
+      m.predict File.join(fold,"test.csv") # presumably writes test-prediction.csv into the fold directory (read by the summary step) - confirm
+    end
+  end
+  Process.waitall # block until every child has exited; NOTE(review): child exit statuses are not checked
+  puts "Crossvalidation: #{(Time.now-t)/60} min"
+
+  # crossvalidation summaries
+
+  predictions = []
+  tp=0
+  tn=0
+  fp=0
+  fn=0
+  hc_tp=0
+  hc_tn=0
+  hc_fp=0
+  hc_fn=0
+
+  # index training rows by SMILES once; rescanning model.train for every prediction is O(n^2)
+  experimental = model.train.group_by{|t| t[0]}
+  File.open(File.join(cv_dir,"predictions.csv"),"w+") do |f|
+    folds.each do |fold|
+      pred = File.readlines(File.join(fold,"test-prediction.csv")).collect{|row| row.chomp.split(",")}
+      pred.shift # drop the header row
+      pred.each do |prediction|
+        smi = prediction[0]
+        exp = experimental.fetch(smi,[]).collect{|t| t[1].to_i} # all measured values for this structure (duplicates possible)
+        maxsim = prediction[5].to_f
+        v = "NA" # stays NA when there is no prediction or no experimental value
+        unless prediction[2].to_s.empty? || exp.empty? # to_s: split(",") drops trailing empty fields, leaving nil
+          predicted = prediction[2].to_i
+          exp.each do |e| # every measurement is counted separately in the confusion matrix
+            if predicted == 1 && e == 1
+              v = "TP"
+              tp+=1
+              hc_tp+=1 if maxsim > model.minsim.max
+            elsif predicted == 0 && e == 0
+              v = "TN"
+              tn+=1
+              hc_tn+=1 if maxsim > model.minsim.max
+            elsif predicted == 1 && e == 0
+              v = "FP"
+              fp+=1
+              hc_fp+=1 if maxsim > model.minsim.max
+            elsif predicted == 0 && e == 1
+              v = "FN"
+              fn+=1
+              hc_fn+=1 if maxsim > model.minsim.max
+            end
+            predictions << v
+          end
+        end
+        f.puts([smi,v,maxsim].join(",")) # one row per predicted structure; v is the verdict for its last measurement
+      end
+    end
+  end
+
+  File.open(File.join(cv_dir,"confusion-matrix-all.csv"),"w+") do |f|
+    f.puts "#{tp},#{fp}\n#{fn},#{tn}" # rows: predicted 1, predicted 0; columns: experimental 1, experimental 0
+  end
+
+  File.open(File.join(cv_dir,"confusion-matrix-high-confidence.csv"),"w+") do |f| # same layout, restricted to maxsim > model.minsim.max
+    f.puts "#{hc_tp},#{hc_fp}\n#{hc_fn},#{hc_tn}"
+  end
+
+  File.open(File.join(cv_dir,"summary-all.csv"),"w+") do |f| # one "metric,value" row per statistic
+    f.puts "accuracy,#{(tp+tn)/(tp+fp+tn+fn).to_f}" # .to_f forces float division; a zero denominator yields NaN instead of raising
+    f.puts "true_positive_rate,#{tp/(tp+fn).to_f}"
+    f.puts "true_negative_rate,#{tn/(tn+fp).to_f}"
+    f.puts "positive_predictive_value,#{tp/(tp+fp).to_f}"
+    f.puts "negative_predictive_value,#{tn/(tn+fn).to_f}"
+  end
+
+  File.open(File.join(cv_dir,"summary-high-confidence.csv"),"w+") do |f| # same statistics from the hc_* counters
+    f.puts "accuracy,#{(hc_tp+hc_tn)/(hc_tp+hc_fp+hc_tn+hc_fn).to_f}"
+    f.puts "true_positive_rate,#{hc_tp/(hc_tp+hc_fn).to_f}"
+    f.puts "true_negative_rate,#{hc_tn/(hc_tn+hc_fp).to_f}"
+    f.puts "positive_predictive_value,#{hc_tp/(hc_tp+hc_fp).to_f}"
+    f.puts "negative_predictive_value,#{hc_tn/(hc_tn+hc_fn).to_f}"
+  end
+
+end
+
diff --git a/bin/scale_independent_variables.rb b/bin/scale_independent_variables.rb
deleted file mode 100755
index 1d7662a..0000000
--- a/bin/scale_independent_variables.rb
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env ruby
-require_relative "../lib/lazar"
-dataset = Dataset.new ARGV[0]
-dataset.scale_independent_variables ARGV[0]