updated scripts and readme

author: gebele <gebele@in-silico.ch> 2017-03-07 12:04:23 +0000
committer: gebele <gebele@in-silico.ch> 2017-03-07 12:04:23 +0000
commit: 6bd7492c2cc0e7887e99f00ad5f9a8e8fd094392 (patch)
tree: 154266ca1f5dff52c4512daafcec52fbfe1ce790
parent: 61b98e011d4b32e88629d5a07d7b84a6abdded64 (diff)
5 files changed, 178 insertions, 8 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..279b45b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+dump/
diff --git a/README.md b/README.md
index b62e6dd..0e471df 100644
--- a/README.md
+++ b/README.md
@@ -2,5 +2,8 @@
 
 - Public lazar datasets
 - Scripts to generate prediction models and crossvalidations
-
-`mongorestore --host 127.0.0.1` imports the database dump
+- Scripts to generate and compare validation reports
+``` ruby
+  # comment or uncomment the sections in create_prediction_models.rb that you need, then run:
+  ruby create_prediction_models.rb &
+```
diff --git a/compare_validation_reports.rb b/compare_validation_reports.rb
new file mode 100644
index 0000000..8852a34
--- /dev/null
+++ b/compare_validation_reports.rb
@@ -0,0 +1,104 @@
+#!/usr/bin/env ruby
+require 'optparse'
+require 'json'
+
+
+options = {}
+OptionParser.new do |opts|
+  opts.banner = "Usage: compare_validation_reports.rb [options]"
+  
+  opts.on("-d d", "--dir=dir", "Path to the validation reports dir.") do |dir|
+    options[:dir] = (dir[-1,1] == "/" ? dir : dir + "/")
+  end
+
+  opts.on("-c", "--classification", "Select only classification reports from dir.") do |c|
+    if options[:regression]
+      puts "Don't use optional parameters -c and -r at the same time. Mixed by default."
+      exit
+    end
+    options[:classification] = c
+  end
+
+  opts.on("-r", "--regression", "Select only regression reports from dir.") do |r|
+    if options[:classification]
+      puts "Don't use optional parameters -c and -r at the same time. Mixed by default."
+      exit
+    end
+    options[:regression] = r
+  end
+
+  opts.on("-v", "--verbose", "Display verbose report. Standard for -d mode without -c or -r parameters.") do |v|
+    options[:verbose] = v
+  end
+
+  opts.on("-f f", "--file=files", "Select two or more comma separated reports.") do |files|
+    list = files.split(",") # one path per report, in the order given on the command line
+    unless list.size >= 2 # accept "two or more", as the help text and the error message both state
+      puts "You have to pass at least two files as argument with full path."
+      exit
+    end
+    options[:files] = list
+  end
+  
+  opts.on("-h", "--help", "Displays help") do
+    puts opts
+    exit
+  end
+
+end.parse!
+
+if options.empty? || (!options[:files] && !options[:dir])
+  puts "Usage: compare_validation_reports.rb -h"
+  exit
+end
+
+if options[:dir]
+  if options[:verbose]
+    if !options[:classification] && !options[:regression]
+      json = Dir[options[:dir]+'*.json'].map { |f| JSON.parse File.read(f) }.flatten
+      puts JSON.pretty_generate json
+    end
+    if options[:classification]
+      json = Dir[options[:dir]+'*_classification_*.json'].map { |f| JSON.parse File.read(f) }.flatten
+      puts JSON.pretty_generate json
+    end
+    if options[:regression]
+      json = Dir[options[:dir]+'*_regression_*.json'].map { |f| JSON.parse File.read(f) }.flatten
+      puts JSON.pretty_generate json
+    end
+  else
+
+    main = {}
+
+    if !options[:classification] && !options[:regression] && !options[:verbose]
+      json = Dir[options[:dir]+'*.json'].map { |f| JSON.parse File.read(f) }.flatten
+      puts JSON.pretty_generate json
+    end
+    if options[:classification]
+      json = Dir[options[:dir]+'*_classification_*.json'].map { |f| JSON.parse File.read(f) }.flatten
+      json.each do |report|
+        main[report["endpoint"]] ||= []
+        main[report["endpoint"]] << [report["species"], report["created_at"], report["crossvalidations"].map{|cv| {"accuracy": cv[1]["statistics"]["accuracy"], "weighted_accuracy": cv[1]["statistics"]["weighted_accuracy"], "true_rate": cv[1]["statistics"]["true_rate"], "predictivity": cv[1]["statistics"]["predictivity"]}}.flatten]
+      end
+      puts JSON.pretty_generate main
+    end
+    if options[:regression]
+      json = Dir[options[:dir]+'*_regression_*.json'].map { |f| JSON.parse File.read(f) }.flatten
+      json.each do |report|
+        main[report["endpoint"]] ||= []
+        main[report["endpoint"]] << [report["species"], report["created_at"], report["crossvalidations"].map{|cv| {"rmse": cv[1]["statistics"]["rmse"], "r_squared": cv[1]["statistics"]["r_squared"]}}.flatten]
+      end
+      puts JSON.pretty_generate main
+    end
+  end
+end
+
+if options[:files]
+  json = []
+  options[:files].each do |file|
+    json << JSON.parse(File.read(file))
+  end
+  puts JSON.pretty_generate json.flatten
+
+end
+
diff --git a/create_prediction_models.rb b/create_prediction_models.rb
index c653f01..9c3995f 100644
--- a/create_prediction_models.rb
+++ b/create_prediction_models.rb
@@ -2,22 +2,24 @@ ENV["LAZAR_ENV"] = "production"
 require_relative '../lazar/lib/lazar'
 #require 'lazar'
 include OpenTox
-$mongo.database.drop
-$gridfs = $mongo.database.fs # recreate GridFS indexes
+#$mongo.database.drop
+#$gridfs = $mongo.database.fs # recreate GridFS indexes
 
-#=begin
+=begin
 # classification models
 Dir["classification/*csv"].each do |file|
   unless file.match(/hamster/)
     Model::Validation.from_csv_file file
   end
 end
-#=end
+=end
 
 #=begin
 # regression models
 Dir["regression/*log10.csv"].each do |file|
-  Model::Validation.from_csv_file file
+  unless file.match(/fathead/) # skipped until duplicates are cleared
+    Model::Validation.from_csv_file file
+  end
 end
 #=end
 
@@ -62,6 +64,11 @@ feature_categories.each do |category|
 end
 =end
 
-# save
+# save local dump but git ignored
 `mongodump -h 127.0.0.1 -d production`
+
+# build reports and users dump
+load './lazar_validation_reports.rb' # run the script as a file (not eval'd text) so errors point at its own lines
+
+# restore
 #`mongorestore --host 127.0.0.1`
diff --git a/lazar_validation_reports.rb b/lazar_validation_reports.rb
new file mode 100644
index 0000000..5621577
--- /dev/null
+++ b/lazar_validation_reports.rb
@@ -0,0 +1,55 @@
+ENV["LAZAR_ENV"] = "production"
+require_relative '../lazar/lib/lazar'
+#require 'lazar'
+require 'json'
+include OpenTox
+
+models = Model::Validation.all
+size = models.size
+puts "#{size} reports to store."
+
+# create dir if not exists
+path = "#{ENV['HOME']}/lazar-validation-reports"
+dir = FileUtils.mkdir_p path
+
+models.each_with_index do |model, idx|
+
+  @json = {}
+  
+  # define file name: <date>_<type>_<branch>_<commit>_<endpoint_species>
+  type = model.regression? ? "regression" : "classification"
+  #name = model.model.name.gsub!(/[^0-9A-Za-z.\-]/, '_')
+  date = model.created_at.to_s.split.first # first whitespace-separated token; presumably the date part
+  name = (model.endpoint + "_" + model.species).gsub(/[^0-9A-Za-z.\-]/, '_') # gsub, not gsub!: the bang form returns nil when nothing is replaced
+  branch = model.model.version["branch"]
+  commit = model.model.version["commit"]
+  filename = [date,type,branch,commit,name].join("_")
+
+  # collect object data
+  @json["endpoint"] = model.endpoint
+  @json["species"] = model.species
+  @json["source"] = model.source
+  @json["training_dataset"] = model.training_dataset.source
+  @json["training_compounds"] = model.training_dataset.data_entries.size
+  @json["algorithms"] = model.algorithms
+  @json["name"] = model.model.name
+  @json["created_at"] = model.created_at
+  @json["unit"] = model.unit
+  @json["version"] = model.model.version
+  @json["crossvalidations"] = {} 
+  model.crossvalidations.each_with_index do |cv,idx|
+    @json["crossvalidations"][idx.to_s] = {"folds": cv.folds, "instances": cv.nr_instances, "unpredicted": cv.nr_unpredicted, "statistics": cv.statistics}
+  end
+
+  # write report to file
+  File.open("#{path}/#{filename}.json", "w") do |f|
+    f.write(JSON.pretty_generate(JSON.parse(@json.to_json)))
+  end
+
+  puts "#{size - (idx+1)} left to store."
+
+end
+
+# store database dump
+puts "Storing database dump."
+`mongodump -h 127.0.0.1 -o #{path}/#{Time.now.to_s.split.first}-dump-#{ENV["LAZAR_ENV"]} -d #{ENV["LAZAR_ENV"]}`
author	gebele <gebele@in-silico.ch>	2017-03-07 12:04:23 +0000
committer	gebele <gebele@in-silico.ch>	2017-03-07 12:04:23 +0000
commit	6bd7492c2cc0e7887e99f00ad5f9a8e8fd094392 (patch)
tree	154266ca1f5dff52c4512daafcec52fbfe1ce790
parent	61b98e011d4b32e88629d5a07d7b84a6abdded64 (diff)