summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgebele <gebele@in-silico.ch>2017-03-07 12:04:23 +0000
committergebele <gebele@in-silico.ch>2017-03-07 12:04:23 +0000
commit6bd7492c2cc0e7887e99f00ad5f9a8e8fd094392 (patch)
tree154266ca1f5dff52c4512daafcec52fbfe1ce790
parent61b98e011d4b32e88629d5a07d7b84a6abdded64 (diff)
updated scripts and readme
-rw-r--r--.gitignore1
-rw-r--r--README.md7
-rw-r--r--compare_validation_reports.rb104
-rw-r--r--create_prediction_models.rb19
-rw-r--r--lazar_validation_reports.rb55
5 files changed, 178 insertions, 8 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..279b45b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+dump/
diff --git a/README.md b/README.md
index b62e6dd..0e471df 100644
--- a/README.md
+++ b/README.md
@@ -2,5 +2,8 @@
- Public lazar datasets
- Scripts to generate prediction models and crossvalidations
-
-`mongorestore --host 127.0.0.1` imports the database dump
+- Scripts to generate and compare validation reports
+``` ruby
+ # comment or uncomment the model sections in create_prediction_models.rb to choose what gets built
+ ruby create_prediction_models.rb &
+```
diff --git a/compare_validation_reports.rb b/compare_validation_reports.rb
new file mode 100644
index 0000000..8852a34
--- /dev/null
+++ b/compare_validation_reports.rb
@@ -0,0 +1,104 @@
+#!/usr/bin/env ruby
+require 'optparse'
+require 'json'
+
+
+# Command-line options, filled in by the OptionParser callbacks below.
+options = {}
+OptionParser.new do |opts|
+  opts.banner = "Usage: compare_validation_reports.rb [options]"
+  # Store the reports directory with a trailing slash so glob patterns can be appended.
+  opts.on("-d d", "--dir=dir", "Path to the validation reports dir.") do |dir|
+    options[:dir] = dir.end_with?("/") ? dir : "#{dir}/"
+  end
+  # -c and -r are mutually exclusive: whichever of the two is seen second aborts the run.
+  opts.on("-c", "--classification", "Select only classification reports from dir.") do |c|
+    if options[:regression]
+      puts "Don't use optional parameters -c and -r at the same time. Mixed by default."
+      exit
+    end
+    options[:classification] = c
+  end
+
+  opts.on("-r", "--regression", "Select only regression reports from dir.") do |r|
+    if options[:classification]
+      puts "Don't use optional parameters -c and -r at the same time. Mixed by default."
+      exit
+    end
+    options[:regression] = r
+  end
+  opts.on("-v", "--verbose", "Display verbose report. Standard for -d mode without -c or -r parameters.") do |v|
+    options[:verbose] = v
+  end
+  # FIX: the help text and the error message both promise "two or more" reports, but the
+  # old `== 2` check rejected three or more; only fewer than two files is an error.
+  opts.on("-f f", "--file=files", "Select two or more comma seperated reports.") do |files|
+    list = files.split(",")
+    if list.size < 2
+      puts "You have to pass at least two files as argument with full path."
+      exit
+    end
+    options[:files] = list
+  end
+
+  opts.on("-h", "--help", "Displays help") do
+    puts opts
+    exit
+  end
+end.parse!
+
+# Neither a reports directory nor a file list was given: point the user at -h.
+unless options[:dir] || options[:files]
+  puts "Usage: compare_validation_reports.rb -h"
+  exit
+end
+# -d mode: print the reports found in the given directory.
+if options[:dir]
+  # Parses every file in the directory that matches the glob into one flat array.
+  load_reports = lambda do |glob|
+    Dir[options[:dir] + glob].map { |path| JSON.parse(File.read(path)) }.flatten
+  end
+
+  # Adds one [species, created_at, statistics] row per report to `summary`, grouped
+  # by endpoint; only the statistics listed in `names` are kept per crossvalidation.
+  summarize = lambda do |summary, reports, names|
+    reports.each do |report|
+      # each crossvalidation is used as a pair whose second element holds the data
+      picked = report["crossvalidations"].map do |cv|
+        names.map { |name| [name.to_sym, cv[1]["statistics"][name]] }.to_h
+      end
+      (summary[report["endpoint"]] ||= []) << [report["species"], report["created_at"], picked]
+    end
+    summary
+  end
+
+  globs = { classification: '*_classification_*.json', regression: '*_regression_*.json' }
+  wanted = {
+    classification: %w[accuracy weighted_accuracy true_rate predictivity],
+    regression: %w[rmse r_squared]
+  }
+  selected = globs.keys.select { |type| options[type] }
+
+  # Without -c or -r every report is printed in full, whether or not -v was given.
+  puts JSON.pretty_generate(load_reports.call('*.json')) if selected.empty?
+
+  main = {}
+  selected.each do |type|
+    reports = load_reports.call(globs[type])
+    if options[:verbose]
+      puts JSON.pretty_generate(reports)
+    else
+      puts JSON.pretty_generate(summarize.call(main, reports, wanted[type]))
+    end
+  end
+end
+
+# -f mode: parse each listed report file and print them together as a single
+# pretty-printed JSON array.
+if options[:files]
+  parsed = options[:files].map { |path| JSON.parse(File.read(path)) }
+  # flatten merges a file whose top-level value is an array into the combined list
+  combined = parsed.flatten
+  puts JSON.pretty_generate(combined)
+end
+
diff --git a/create_prediction_models.rb b/create_prediction_models.rb
index c653f01..9c3995f 100644
--- a/create_prediction_models.rb
+++ b/create_prediction_models.rb
@@ -2,22 +2,24 @@ ENV["LAZAR_ENV"] = "production"
require_relative '../lazar/lib/lazar'
#require 'lazar'
include OpenTox
-$mongo.database.drop
-$gridfs = $mongo.database.fs # recreate GridFS indexes
+#$mongo.database.drop
+#$gridfs = $mongo.database.fs # recreate GridFS indexes
-#=begin
+=begin
# classification models
Dir["classification/*csv"].each do |file|
unless file.match(/hamster/)
Model::Validation.from_csv_file file
end
end
-#=end
+=end
#=begin
# regression models
Dir["regression/*log10.csv"].each do |file|
- Model::Validation.from_csv_file file
+ unless file.match(/fathead/)#until dublicates not cleared
+ Model::Validation.from_csv_file file
+ end
end
#=end
@@ -62,6 +64,11 @@ feature_categories.each do |category|
end
=end
-# save
+# save local dump but git ignored
`mongodump -h 127.0.0.1 -d production`
+
+# build reports and users dump
+eval File.read('./lazar_validation_reports.rb')
+
+# restore
#`mongorestore --host 127.0.0.1`
diff --git a/lazar_validation_reports.rb b/lazar_validation_reports.rb
new file mode 100644
index 0000000..5621577
--- /dev/null
+++ b/lazar_validation_reports.rb
@@ -0,0 +1,55 @@
+ENV["LAZAR_ENV"] = "production"
+require_relative '../lazar/lib/lazar'
+#require 'lazar'
+require 'json'
+include OpenTox
+
+# Every record returned by Model::Validation.all gets one report file; `size` feeds
+# the progress messages printed while the reports are written.
+models = Model::Validation.all
+size = models.size
+puts "#{size} reports to store."
+# create the reports dir under $HOME if it does not exist (result is not needed)
+path = "#{ENV['HOME']}/lazar-validation-reports"
+FileUtils.mkdir_p(path)
+
+# Write one JSON report file per validation model into `path`.
+models.each_with_index do |model, idx|
+  # define file name: <date>_<type>_<branch>_<commit>_<endpoint_species>
+  type = model.regression? ? "regression" : "classification"
+  date = model.created_at.to_s.split.first
+  # FIX: gsub! returns nil when nothing was replaced, which silently dropped the name
+  # from the file name (so such reports could overwrite each other); gsub always
+  # returns the sanitized string.
+  name = "#{model.endpoint}_#{model.species}".gsub(/[^0-9A-Za-z.\-]/, '_')
+  version = model.model.version
+  filename = [date, type, version["branch"], version["commit"], name].join("_")
+
+  # collect object data
+  report = {
+    "endpoint" => model.endpoint,
+    "species" => model.species,
+    "source" => model.source,
+    "training_dataset" => model.training_dataset.source,
+    "training_compounds" => model.training_dataset.data_entries.size,
+    "algorithms" => model.algorithms,
+    "name" => model.model.name,
+    "created_at" => model.created_at,
+    "unit" => model.unit,
+    "version" => version,
+    "crossvalidations" => {}
+  }
+  # crossvalidations are keyed by their position ("0", "1", ...)
+  model.crossvalidations.each_with_index do |cv, cv_idx|
+    report["crossvalidations"][cv_idx.to_s] = {"folds" => cv.folds, "instances" => cv.nr_instances, "unpredicted" => cv.nr_unpredicted, "statistics" => cv.statistics}
+  end
+
+  # write report to file; the to_json/parse round trip is kept from the original
+  # (presumably it normalizes non-plain objects before pretty printing -- confirm)
+  File.open("#{path}/#{filename}.json", "w") do |f|
+    f.write(JSON.pretty_generate(JSON.parse(report.to_json)))
+  end
+  puts "#{size - (idx+1)} left to store."
+end
+# Run mongodump for the LAZAR_ENV database, writing into a dated folder inside the reports directory.
+puts "Storing database dump."
+%x(mongodump -h 127.0.0.1 -o #{path}/#{Time.now.strftime("%Y-%m-%d")}-dump-#{ENV["LAZAR_ENV"]} -d #{ENV["LAZAR_ENV"]})