summaryrefslogtreecommitdiff
path: root/5x_cv
diff options
context:
space:
mode:
Diffstat (limited to '5x_cv')
-rw-r--r--5x_cv/5x_crossvalidation.rb58
-rw-r--r--5x_cv/check_datasets_inchi_equality.rb98
-rwxr-xr-x5x_cv/comparealgs_dv.sh14
-rw-r--r--5x_cv/dataset_config13
-rw-r--r--5x_cv/exceptions_config.yaml6
-rw-r--r--5x_cv/factors_config1
-rw-r--r--5x_cv/get_csv_versions.rb3
-rw-r--r--5x_cv/get_csv_versions.sh20
-rw-r--r--5x_cv/lib/cv_am.rb161
-rwxr-xr-x5x_cv/wrapper5cv.sh38
10 files changed, 412 insertions, 0 deletions
diff --git a/5x_cv/5x_crossvalidation.rb b/5x_cv/5x_crossvalidation.rb
new file mode 100644
index 0000000..bdde2dc
--- /dev/null
+++ b/5x_cv/5x_crossvalidation.rb
@@ -0,0 +1,58 @@
+# Do a five times 10-fold crossvalidation
+# # Author: Andreas Maunz, David Vorgrimmler
+# # @params: CSV-File, Method (LAST, BBRC), Minimum Frequency
+
+require 'rubygems'
+require 'opentox-ruby'
+require 'lib/cv_am.rb'
+
+subjectid = nil
+
+if ARGV.size != 1
+ puts
+ puts "Error! Arguments: <algorithm_params> in the form p1=v1;p2=v2;...;pn=vn"
+ exit 1
+end
+
+# Arguments for lib/cv.rb: file_or_dataset_uri feature_generation min_frequency min_chisq_significance backbone stratified random_seed prediction_algorithm local_svm_kernel nr_hits conf_stdev
+position_mapper={
+ "dataset_uri" => 0,
+ "feature_generation_uri" => 1,
+ "min_frequency" => 2,
+ "min_chisq_significance" => 3,
+ "backbone" => 4,
+ "stratified" => 5,
+ "random_seed" => 6,
+ "prediction_algorithm" => 7,
+ "local_svm_kernel" => 8,
+ "nr_hits" => 9,
+ "conf_stdev" => 10
+}
+
+param_str=ARGV[0]
+puts param_str
+params = Array.new(position_mapper.size,"")
+param_str.split(";").each { |param|
+ k,v = param.split("=")
+ params[position_mapper[k]] = v
+}
+params[5]="false" # stratified
+
+exception_config = YAML.load_file("exceptions_config.yaml")
+if ! exception_config[params[0]].nil?
+ exception_config[params[0]].each { |k,v|
+ puts "Setting exception: #{k} => #{v}"
+ params[position_mapper[k]] = v
+ }
+end
+
+for i in 1..5
+ begin
+ puts
+ puts "Round #{i.to_s}."
+ params[6]=i # random seed
+ cv(params)
+ rescue Exception => e
+ puts "Error in 5xCV: #{e.message}: #{e.backtrace}"
+ end
+end
diff --git a/5x_cv/check_datasets_inchi_equality.rb b/5x_cv/check_datasets_inchi_equality.rb
new file mode 100644
index 0000000..7d38e96
--- /dev/null
+++ b/5x_cv/check_datasets_inchi_equality.rb
@@ -0,0 +1,98 @@
+require 'rubygems'
+require 'opentox-ruby'
+require 'yaml'
+
+@subjectid = nil
+
+
+
+def check_ds(t_ds_uri, f_ds_uri)
+ puts t_ds_uri
+ puts f_ds_uri
+
+ regression_training_dataset = OpenTox::Dataset.find(t_ds_uri, @subjectid)#3963;1572;
+ regression_feature_dataset = OpenTox::Dataset.find(f_ds_uri, @subjectid)#3971;3946;
+
+ train_ds = regression_training_dataset.data_entries.keys
+ train_cmds = regression_training_dataset.compounds
+ feature_ds = regression_feature_dataset.data_entries.keys
+ feature_cmds = regression_feature_dataset.compounds
+
+ puts "----- Check activity inchi -----"
+ match=0
+ mismatch=0
+ train_ds.each { |i|
+ if feature_ds.include?(i)
+ match = match + 1
+ else
+ mismatch = mismatch + 1
+ end
+ }
+ if mismatch > 0
+    puts "NOT all training compounds represented in feature dataset!!!"
+ puts "match: #{match}; and mismatch: !!!!!#{mismatch}!!!!!"
+ else
+    puts "All training compounds represented in feature dataset."
+ puts "match: #{match}; mismatch: #{mismatch}"
+ puts "OK!!!"
+ end
+
+ train_ds.sort!
+ feature_ds.sort!
+
+ if train_ds == feature_ds
+ puts "train_ds == feature_ds"
+ else
+ a = train_ds - feature_ds
+ #puts "d: '#{a}'"
+ puts "train_ds: " + train_ds.size.to_s + "; feature_ds: "+ feature_ds.size.to_s
+ puts "train_ds =NOT feature_ds"
+ end
+
+
+
+ puts "----- Check compound inchi -----"
+ match=0
+ mismatch=0
+ train_cmds.each { |i|
+ if feature_cmds.include?(i)
+ match = match + 1
+ else
+ mismatch = mismatch + 1
+ end
+ }
+ if mismatch > 0
+    puts "NOT all training compounds represented in feature dataset!!!"
+ puts "match: #{match}; and mismatch: !!!!!#{mismatch}!!!!!"
+ else
+    puts "All training compounds represented in feature dataset."
+ puts "match: #{match}; mismatch: #{mismatch}"
+ puts "OK!!!"
+ end
+
+
+
+ feature_cmds.sort!
+ train_cmds.sort!
+
+ if train_cmds == feature_cmds
+ puts "train_cmds == feature_cmds"
+ else
+ b = train_cmds - feature_cmds
+ #puts "d: '#{b}'"
+ puts "train_cmds: " + train_cmds.size.to_s + "; feature_cmds: " + feature_cmds.size.to_s
+ puts "train_cmds =NOT feature_cmds"
+ end
+ puts
+end
+
+
+
+
+ds = YAML::load_file("datasets_nestle.yaml")
+ds.keys.each { |dataset|
+ ds[dataset].keys.each { |pc|
+ puts pc
+ check_ds(ds[dataset]["dataset"], ds[dataset][pc])
+ }
+}
diff --git a/5x_cv/comparealgs_dv.sh b/5x_cv/comparealgs_dv.sh
new file mode 100755
index 0000000..1b7a7b4
--- /dev/null
+++ b/5x_cv/comparealgs_dv.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+if [ $# -ne 4 ]; then
+ echo "\"validation_uri1,validation_uri2,...\" \"identifier1,identifier2,...\" \"significance [0.95-0.6]\" \"attributes: weighted_r_square,weighted_root_mean_squared_error,weighted_mean_absolute_error,r_square,root_mean_squared_error,sample_correlation_coefficient\""
+ exit 1
+fi
+
+uris="$1"
+iden="$2"
+signi="$3" #default 0.9; 0.95 - 0.6
+attri="$4" #weighted_r_square,weighted_root_mean_squared_error,weighted_mean_absolute_error,r_square,root_mean_squared_error,sample_correlation_coefficient
+host="toxcreate3.in-silico.ch:8080"
+
+curl -X POST -d "validation_uris=$uris" -d "identifier=$iden" -d "ttest_significance=$signi" -d "ttest_attributes=$attri" http://$host/validation/report/algorithm_comparison
diff --git a/5x_cv/dataset_config b/5x_cv/dataset_config
new file mode 100644
index 0000000..a52649c
--- /dev/null
+++ b/5x_cv/dataset_config
@@ -0,0 +1,13 @@
+#EPA v4b Fathead Minnow Acute Toxicity LC50_mmol
+http://toxcreate3.in-silico.ch:8080/dataset/2133
+#CPDBAS_v5d_20Nov2008_rat_TD50
+#http://toxcreate3.in-silico.ch:8080/dataset/1408
+#CPDBAS_v5d_20Nov2008_mouse_TD50
+#http://toxcreate3.in-silico.ch:8080/dataset/1384
+#MultiCellCall: DSSTox Carcinogenic Potency DBS MultiCellCall_no_duplicates.csv
+#http://toxcreate3.in-silico.ch:8080/dataset/130
+#Bloodbarr: bloodbarr_no_duplicate.csv
+#http://toxcreate3.in-silico.ch:8080/dataset/271
+#Salmonella Mutagenicity: DSSTox Carcinogenic Potency DBS Mutagenicity_no_duplicates.csv
+#http://toxcreate3.in-silico.ch:8080/dataset/233
+
diff --git a/5x_cv/exceptions_config.yaml b/5x_cv/exceptions_config.yaml
new file mode 100644
index 0000000..7124c62
--- /dev/null
+++ b/5x_cv/exceptions_config.yaml
@@ -0,0 +1,6 @@
+http://toxcreate3.in-silico.ch:8080/dataset/271:
+ min_frequency: 12
+http://x61s.fdm.uni-freiburg.de/dataset/3546:
+ min_frequency: 8
+http://x61s.fdm.uni-freiburg.de/dataset/3543:
+ min_frequency: 6
diff --git a/5x_cv/factors_config b/5x_cv/factors_config
new file mode 100644
index 0000000..72dbb5f
--- /dev/null
+++ b/5x_cv/factors_config
@@ -0,0 +1 @@
+feature_generation_uri=http://toxcreate3.in-silico.ch:8080/algorithm/fminer/bbrc
diff --git a/5x_cv/get_csv_versions.rb b/5x_cv/get_csv_versions.rb
new file mode 100644
index 0000000..c09a46e
--- /dev/null
+++ b/5x_cv/get_csv_versions.rb
@@ -0,0 +1,3 @@
+require 'yaml'
+ds = YAML::load_file("datasets_nestle.yaml")
+ds.keys.each { |d| puts d ; ds[d].keys.each {|t| puts " #{t}"; cmd = " curl -H 'accept:text/csv' #{ds[d][t]} > csv_file; mv -v --backup=numbered csv_file #{d}_#{t.gsub(/,/, '_')}.csv" unless t=="dataset"; puts cmd } }
diff --git a/5x_cv/get_csv_versions.sh b/5x_cv/get_csv_versions.sh
new file mode 100644
index 0000000..f3a29eb
--- /dev/null
+++ b/5x_cv/get_csv_versions.sh
@@ -0,0 +1,20 @@
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/2902 > csv_file; mv -v --backup=numbered csv_file FHM_electronic_cpsa.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1908 > csv_file; mv -v --backup=numbered csv_file FHM_geometrical.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1909 > csv_file; mv -v --backup=numbered csv_file FHM_topological.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1910 > csv_file; mv -v --backup=numbered csv_file FHM_hybrid.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/2903 > csv_file; mv -v --backup=numbered csv_file FHM_constitutional.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/2900 > csv_file; mv -v --backup=numbered csv_file MDD_electronic_cpsa.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1905 > csv_file; mv -v --backup=numbered csv_file MDD_geometrical.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1906 > csv_file; mv -v --backup=numbered csv_file MDD_topological.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1907 > csv_file; mv -v --backup=numbered csv_file MDD_hybrid.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/2901 > csv_file; mv -v --backup=numbered csv_file MDD_constitutional.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/2904 > csv_file; mv -v --backup=numbered csv_file RAT_electronic_cpsa.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1911 > csv_file; mv -v --backup=numbered csv_file RAT_geometrical.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1912 > csv_file; mv -v --backup=numbered csv_file RAT_topological.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1913 > csv_file; mv -v --backup=numbered csv_file RAT_hybrid.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/2905 > csv_file; mv -v --backup=numbered csv_file RAT_constitutional.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/2906 > csv_file; mv -v --backup=numbered csv_file MOU_electronic_cpsa.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1914 > csv_file; mv -v --backup=numbered csv_file MOU_geometrical.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1915 > csv_file; mv -v --backup=numbered csv_file MOU_topological.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/1916 > csv_file; mv -v --backup=numbered csv_file MOU_hybrid.csv
+ curl -H 'accept:text/csv' http://toxcreate3.in-silico.ch:8086/dataset/2907 > csv_file; mv -v --backup=numbered csv_file MOU_constitutional.csv
diff --git a/5x_cv/lib/cv_am.rb b/5x_cv/lib/cv_am.rb
new file mode 100644
index 0000000..965cd5b
--- /dev/null
+++ b/5x_cv/lib/cv_am.rb
@@ -0,0 +1,161 @@
+# Do a 10-fold crossvalidation with multiple datasets
+# Author: Andreas Maunz, David Vorgrimmler
+# @params: CSV-File, Method (LAST, BBRC), Minimum Frequency
+
+def cv (args)
+
+ subjectid = nil#OpenTox::Authorization.authenticate(guest,guest)
+
+ if args.size != 11
+ puts
+ puts "Error! Arguments: file_or_dataset_uri feature_generation min_frequency min_chisq_significance backbone stratified random_seed prediction_algorithm local_svm_kernel nr_hits conf_stdev"
+ exit 1
+ end
+
+ reg=/^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$/ix
+
+ file=args[0]
+
+
+ # dataset_is_uri=false
+ # if reg.match(file)? true : false
+ # #file.include? "http"
+ # puts "Uri is valid"
+ dataset_is_uri=true
+# files = [ file ]
+ # elsif ! File.exists? file
+ # puts "File #{file} missing"
+ # exit 1
+ # end
+
+# if args[1].to_s != "last" && args[1].to_s != "bbrc"
+ if !(args[1].to_s.include? "/algorithm/fminer/bbrc") && !(args[1].to_s.include? "/algorithm/fminer/last")
+ puts "feature_generation_uri must contain '/algorithm/fminer/last' or '/algorithm/fminer/bbrc'"
+# puts "feature_generation must be 'last' or 'bbrc'"
+ exit 1
+ end
+
+  if args[2] != ""
+ if args[2].to_i < 2
+ puts "min_frequency must be at least 2 or \"\""
+ exit 1
+ end
+ end
+
+  if args[3] != ""
+ if ! (args[3].to_f <= 1.0 && args[3].to_f >= 0.0)
+ puts "min_chisq_significance must be between 0 and 1 or \"\""
+ exit 1
+ end
+ end
+
+  if args[4] != ""
+ if args[4].to_s != "true" && args[4].to_s != "false"
+ puts "backbone must be 'true' or 'false'."
+ exit 1
+ end
+ end
+
+
+ if args[5].to_s != "true" && args[5].to_s != "false"
+ puts "stratified must be 'true' or 'false'"
+ exit 1
+ end
+
+  if args[6] != ""
+    if args[6].to_i < 1
+ puts "random_seed must be a natural number or \"\""
+ exit 1
+ end
+ end
+
+  if args[7] != ""
+ if ! (args[7] == "local_svm_classification")
+ puts "lazar_prediction_method must be \"local_svm_classification\""
+ exit 1
+ end
+ end
+
+  if args[8] != ""
+ if ! (args[8] == "weighted_tanimoto" || args[8] == "propositionalized")
+ puts "local_svm_kernel must be \"weighted_tanimoto\" or \"propositionalized\""
+ exit 1
+ end
+ end
+
+  if args[9] != ""
+ if ! (args[9] == "true")
+ puts "nr_hits must be \"true\""
+ exit 1
+ end
+ end
+
+  if args[10] != ""
+ if ! (args[10] == "true")
+ puts "conf_stdev must be \"true\""
+ exit 1
+ end
+ end
+
+
+
+ #if !dataset_is_uri
+ # # Upload a dataset
+ # training_dataset = OpenTox::Dataset.create_from_csv_file(file, subjectid)
+ # prediction_feature = training_dataset.features.keys[0]
+ # training_dataset_uri=training_dataset.uri
+ # puts prediction_feature
+ #else
+ training_dataset_uri=file
+ puts training_dataset_uri
+ prediction_feature = OpenTox::Dataset.find(training_dataset_uri).features.keys.first
+ puts prediction_feature
+ # end
+ puts training_dataset_uri
+
+
+ # Crossvalidation
+ # @param [Hash] params (required:algorithm_uri,dataset_uri,prediction_feature, optional:algorithm_params,num_folds(10),random_seed(1),stratified(false))
+ alg_params = "feature_generation_uri=#{args[1]}";
+ alg_params = alg_params << ";min_frequency=#{args[2]}" unless args[2]==""
+ alg_params = alg_params << ";min_chisq_significance=#{args[3]}" unless args[3]==""
+ alg_params = alg_params << ";backbone=#{args[4]}" unless args[4]==""
+ alg_params = alg_params << ";prediction_algorithm=#{args[7]}" unless args[7]==""
+ alg_params = alg_params << ";local_svm_kernel=#{args[8]}" unless args[8]==""
+ alg_params = alg_params << ";nr_hits=#{args[9]}" unless args[9]==""
+ alg_params = alg_params << ";conf_stdev=#{args[10]}" unless args[10]==""
+
+ stratified_param = args[5]
+ random_seed_param = args[6]
+
+ cv_args = {:dataset_uri => training_dataset_uri, :prediction_feature => prediction_feature, :algorithm_uri => args[1].split('fminer')[0] + "lazar", :algorithm_params => alg_params, :stratified => stratified_param }
+ cv_args[:random_seed] = random_seed_param unless random_seed_param == ""
+ puts file
+ puts cv_args.to_yaml
+ puts
+ begin
+ lazar_single_args = {}
+ lazar_single_args[:feature_generation_uri] = "#{args[1]}";
+ lazar_single_args[:min_frequency] = args[2] unless args[2]==""
+ lazar_single_args[:min_chisq_significance] = args[3] unless args[3]==""
+ lazar_single_args[:backbone] = args[4] unless args[4]==""
+ lazar_single_args[:prediction_algorithm] = args[7] unless args[7]==""
+ lazar_single_args[:local_svm_kernel] = args[8] unless args[8]==""
+ lazar_single_args[:nr_hits] = args[9] unless args[9]==""
+ lazar_single_args[:conf_stdev] = args[10] unless args[10]==""
+ #m = OpenTox::Algorithm::Lazar.new.run({:dataset_uri => training_dataset_uri, :subjectid => subjectid}.merge lazar_single_args ).to_s
+ #puts m
+ cv = OpenTox::Crossvalidation.create(cv_args).uri
+ puts cv
+ cvr = OpenTox::CrossvalidationReport.create( cv , subjectid).uri
+ puts cvr
+ #qmrfr = OpenTox::QMRFReport.create(m).uri
+ #puts qmrfr
+ #cv_stat = OpenTox::Validation.from_cv_statistics( cv, subjectid )
+ #puts cv_stat.metadata.to_yaml
+ #[ cv_stat, training_dataset_uri ]
+ rescue Exception => e
+ puts "cv failed: #{e.message} #{e.backtrace}"
+ end
+
+end
diff --git a/5x_cv/wrapper5cv.sh b/5x_cv/wrapper5cv.sh
new file mode 100755
index 0000000..2155635
--- /dev/null
+++ b/5x_cv/wrapper5cv.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Wrapper Script for CV
+# Set Factors, Datasets, Exceptions in the respective config_files
+# AM, 2011
+
+if [ $# -lt 2 ]; then
+ echo "Usage: $0 factors datasets"
+ exit
+fi
+
+# Configure basics
+source $HOME/.bash_aliases
+otconfig
+THIS_DATE=`date +%Y%m%d_%H_`
+FACTORS="$1"
+DATASETS="$2"
+
+# Don't start when running
+while ps x | grep 5x | grep -v grep >/dev/null 2>&1; do sleep 3; done
+
+LOGFILE="$THIS_DATE""$USER""_wrapper5cv.log"
+rm "$LOGFILE" >/dev/null 2>&1
+
+cat $DATASETS | while read dataset_uri; do
+ if ! [[ "$dataset_uri" =~ "#" ]]; then # allow comments
+ cat $FACTORS | while read factor; do
+ if ! [[ "$factor" =~ "#" ]]; then # allow comments
+      echo "${THIS_DATE}: $factor" >> $LOGFILE 2>&1
+ factor="$factor;dataset_uri=$dataset_uri"
+ echo "ruby 5x_crossvalidation.rb $factor" >> $LOGFILE 2>&1
+ ruby 5x_crossvalidation.rb $factor >> $LOGFILE 2>&1
+ fi
+ done
+ else
+ echo >> $LOGFILE 2>&1
+ echo $dataset_uri >> $LOGFILE 2>&1
+ fi
+done