summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Maunz <andreas@maunz.de>2012-01-17 10:10:33 +0100
committerAndreas Maunz <andreas@maunz.de>2012-01-17 10:10:33 +0100
commit6758b671b7a6efbd7be8bc4fc88e705fb0ee95a0 (patch)
treea762fdcd9fc62f98663a1d005122bfa5b54ad64d
parent2df4866db38f64844166e4eaddf956ff59ca46f7 (diff)
parent627105104a86174794b4edc2f93176bff873ba2b (diff)
Merge branch 'pc_new_1' of github.com:opentox/test into pc_new_1
-rw-r--r--5x_cv/CV_ds_pctype_prop_algo_rseed.rb60
-rw-r--r--5x_cv/check_datasets_inchi_equality.rb98
-rw-r--r--5x_cv/datasets_nestle_outl.yaml67
-rw-r--r--5x_cv/factors_config38
-rwxr-xr-x5x_cv/wrapper_pc_cv.sh45
5 files changed, 308 insertions, 0 deletions
diff --git a/5x_cv/CV_ds_pctype_prop_algo_rseed.rb b/5x_cv/CV_ds_pctype_prop_algo_rseed.rb
new file mode 100644
index 0000000..b9c55dd
--- /dev/null
+++ b/5x_cv/CV_ds_pctype_prop_algo_rseed.rb
@@ -0,0 +1,82 @@
+# Do a crossvalidation and create a crossvalidation report for it.
+# NOTE(review): described as 10-fold by the authors, but no fold count is set here - confirm the service default.
+# Author: Andreas Maunz, David Vorgrimmler
+#
+# Arguments (all five required, in this order):
+#   ds_name      dataset key in datasets_nestle.yaml, e.g. MOU
+#   pc_type      key of the feature dataset, e.g. electronic,cpsa or constitutional
+#   prop         true or false
+#   algo         prediction algorithm, e.g. local_mlr_prop or local_svm_regression
+#   random_seed  random seed passed on to the crossvalidation
+#
+# Exits with status 1 on wrong usage or when the dataset file, the dataset
+# name or the pc_type entry cannot be found.
+
+if ARGV.size != 5
+  puts "Args: ds_name, pc_type, prop, algo, random_seed"
+  puts ARGV.size
+  exit 1
+end
+
+ds_file = "datasets_nestle.yaml"
+path = File.join(Dir.pwd, ds_file)
+if File.exist?(path)
+  puts "#{ds_file} exists"
+else
+  puts "#{ds_file} does not exist."
+  exit 1
+end
+
+require 'rubygems'
+require 'opentox-ruby'
+require 'yaml'
+
+subjectid = nil
+
+ds_name = ARGV[0] # e.g. MOU
+pc_type = ARGV[1] # e.g. electronic,cpsa
+prop = ARGV[2] # true or false
+algo = ARGV[3] # e.g. local_svm_regression, local_mlr_prop
+r_seed = ARGV[4] # 1, 2, ..., 10
+
+# Load the same file whose existence was checked above.
+ds = YAML::load_file(path)
+
+# Report unknown names readably instead of failing with NoMethodError on nil.
+unless ds.is_a?(Hash) && ds[ds_name].is_a?(Hash)
+  puts "Dataset '#{ds_name}' not found in #{ds_file}."
+  exit 1
+end
+ds_uri = ds[ds_name]["dataset"]
+pc_ds_uri = ds[ds_name][pc_type]
+if ds_uri.nil? || pc_ds_uri.nil?
+  puts "No 'dataset' or '#{pc_type}' entry for '#{ds_name}' in #{ds_file}."
+  exit 1
+end
+
+algo_params = [
+  "pc_type=#{pc_type}",
+  "feature_dataset_uri=#{pc_ds_uri}",
+  "propositionalized=#{prop}",
+  "prediction_algorithm=#{algo}"
+].join(";")
+puts algo_params.to_yaml
+
+prediction_feature = OpenTox::Dataset.find(ds_uri).features.keys.first
+
+# Ready
+cv_args = {
+  :dataset_uri => ds_uri,
+  :prediction_feature => prediction_feature,
+  :algorithm_uri => "http://toxcreate3.in-silico.ch:8087/algorithm/lazar",
+  :algorithm_params => algo_params,
+  :stratified => false,
+  :random_seed => r_seed
+}
+puts cv_args.to_yaml
+
+cv = OpenTox::Crossvalidation.create(cv_args).uri
+puts cv
+
+cvr = OpenTox::CrossvalidationReport.create(cv, subjectid).uri
+puts cvr
diff --git a/5x_cv/check_datasets_inchi_equality.rb b/5x_cv/check_datasets_inchi_equality.rb
new file mode 100644
index 0000000..7d38e96
--- /dev/null
+++ b/5x_cv/check_datasets_inchi_equality.rb
@@ -0,0 +1,70 @@
+# Check that the training datasets listed in datasets_nestle.yaml and their
+# feature datasets contain the same data entries and compounds.
+
+require 'rubygems'
+require 'opentox-ruby'
+require 'yaml'
+
+@subjectid = nil
+
+# Print how many items of train also occur in feature, and whether both
+# lists are equal after sorting.
+#
+# title   - heading printed in front of the report
+# label   - suffix of the names used in the output ("ds" gives train_ds/feature_ds)
+# train   - Array of items from the training dataset
+# feature - Array of items from the feature dataset
+def report_overlap(title, label, train, feature)
+  puts "----- #{title} -----"
+  # Array#- keeps every (also repeated) training item missing from feature.
+  mismatch = (train - feature).size
+  match = train.size - mismatch
+  if mismatch > 0
+    puts "NOT all training compounds represented in feature dataset!!!"
+    puts "match: #{match}; and mismatch: !!!!!#{mismatch}!!!!!"
+  else
+    puts "All training compounds represented in feature dataset."
+    puts "match: #{match}; mismatch: #{mismatch}"
+    puts "OK!!!"
+  end
+
+  # Sort copies so that the arrays handed in are not reordered.
+  if train.sort == feature.sort
+    puts "train_#{label} == feature_#{label}"
+  else
+    puts "train_#{label}: #{train.size}; feature_#{label}: #{feature.size}"
+    puts "train_#{label} =NOT feature_#{label}"
+  end
+end
+
+# Fetch a training dataset and a feature dataset and compare first their
+# data-entry keys and then their compound lists.
+#
+# t_ds_uri - URI of the training dataset
+# f_ds_uri - URI of the feature dataset
+def check_ds(t_ds_uri, f_ds_uri)
+  puts t_ds_uri
+  puts f_ds_uri
+
+  training_dataset = OpenTox::Dataset.find(t_ds_uri, @subjectid)
+  feature_dataset = OpenTox::Dataset.find(f_ds_uri, @subjectid)
+
+  report_overlap("Check activity inchi", "ds",
+                 training_dataset.data_entries.keys,
+                 feature_dataset.data_entries.keys)
+  report_overlap("Check compound inchi", "cmds",
+                 training_dataset.compounds,
+                 feature_dataset.compounds)
+  puts
+end
+
+# Compare every feature dataset of every entry with that entry's training dataset.
+ds = YAML::load_file("datasets_nestle.yaml")
+ds.each_value do |uris|
+  uris.each do |pc, feature_uri|
+    # "dataset" is the training dataset itself, not a feature dataset.
+    next if pc == "dataset"
+    puts pc
+    check_ds(uris["dataset"], feature_uri)
+  end
+end
diff --git a/5x_cv/datasets_nestle_outl.yaml b/5x_cv/datasets_nestle_outl.yaml
new file mode 100644
index 0000000..5ef4ea1
--- /dev/null
+++ b/5x_cv/datasets_nestle_outl.yaml
@@ -0,0 +1,67 @@
+{
+
+ "MDD": {
+ "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/1571",
+ "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/4007",
+ "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/4006",
+# "geometrical": "http://toxcreate3.in-silico.ch:8086/dataset/1905",
+# "topological": "http://toxcreate3.in-silico.ch:8086/dataset/1906",
+# "hybrid": "http://toxcreate3.in-silico.ch:8086/dataset/1907",
+ "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5265"
+ },
+
+ "FHM": {
+ "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/1572",
+ "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/4019",
+ "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/4008" #,
+# "geometrical": "http://toxcreate3.in-silico.ch:8086/dataset/1908",
+# "topological": "http://toxcreate3.in-silico.ch:8086/dataset/1909",
+# "hybrid": "http://toxcreate3.in-silico.ch:8086/dataset/1910"
+ },
+
+ "RAT": {
+ "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/1573",
+ "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/4077",
+ "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/4020" #,
+# "geometrical": "http://toxcreate3.in-silico.ch:8086/dataset/1911",
+# "topological": "http://toxcreate3.in-silico.ch:8086/dataset/1912",
+# "hybrid": "http://toxcreate3.in-silico.ch:8086/dataset/1913"
+ },
+
+ "MOU": {
+ "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/1574",
+ "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/4115",
+ "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/4101" #,
+# "geometrical": "http://toxcreate3.in-silico.ch:8086/dataset/1914",
+# "topological": "http://toxcreate3.in-silico.ch:8086/dataset/1915",
+# "hybrid": "http://toxcreate3.in-silico.ch:8086/dataset/1916"
+ },
+
+# "MDD2": {
+# "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/",
+# "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/",
+# "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/",
+# "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/"
+# },
+#
+ "FHM2": {
+ "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/4975",
+ "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5263",
+ "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/5262",
+ "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5266"
+ },
+
+ "RAT2": {
+ "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/4977",
+ "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5253",
+ "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/5246",
+ "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5254"
+ },
+
+ "MOU2": {
+ "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/4976",
+ "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5267",
+ "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/5264",
+ "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5198"
+ }
+}
diff --git a/5x_cv/factors_config b/5x_cv/factors_config
new file mode 100644
index 0000000..604ca45
--- /dev/null
+++ b/5x_cv/factors_config
@@ -0,0 +1,38 @@
+#Dataset pc_type prop prediction_algorithm
+MDD constitutional false local_svm_regression
+MDD constitutional true local_svm_regression
+MDD electronic,cpsa false local_svm_regression
+MDD electronic,cpsa true local_svm_regression
+MDD constitutional,electronic,cpsa false local_svm_regression
+MDD constitutional,electronic,cpsa true local_svm_regression
+#MDD constitutional true local_mlr_prop
+#MDD electronic,cpsa true local_mlr_prop
+#MDD constitutional,electronic,cpsa true local_mlr_prop
+FHM2 constitutional false local_svm_regression
+FHM2 constitutional true local_svm_regression
+FHM2 electronic,cpsa false local_svm_regression
+FHM2 electronic,cpsa true local_svm_regression
+FHM2 constitutional,electronic,cpsa false local_svm_regression
+FHM2 constitutional,electronic,cpsa true local_svm_regression
+#MOU constitutional true local_mlr_prop
+#MOU electronic,cpsa true local_mlr_prop
+#MOU constitutional,electronic,cpsa true local_mlr_prop
+RAT2 constitutional false local_svm_regression
+RAT2 constitutional true local_svm_regression
+RAT2 electronic,cpsa false local_svm_regression
+RAT2 electronic,cpsa true local_svm_regression
+RAT2 constitutional,electronic,cpsa false local_svm_regression
+RAT2 constitutional,electronic,cpsa true local_svm_regression
+#MOU constitutional true local_mlr_prop
+#MOU electronic,cpsa true local_mlr_prop
+#MOU constitutional,electronic,cpsa true local_mlr_prop
+MOU2 constitutional false local_svm_regression
+MOU2 constitutional true local_svm_regression
+MOU2 electronic,cpsa false local_svm_regression
+MOU2 electronic,cpsa true local_svm_regression
+MOU2 constitutional,electronic,cpsa false local_svm_regression
+MOU2 constitutional,electronic,cpsa true local_svm_regression
+#MOU constitutional true local_mlr_prop
+#MOU electronic,cpsa true local_mlr_prop
+#MOU constitutional,electronic,cpsa true local_mlr_prop
+
diff --git a/5x_cv/wrapper_pc_cv.sh b/5x_cv/wrapper_pc_cv.sh
new file mode 100755
index 0000000..719885a
--- /dev/null
+++ b/5x_cv/wrapper_pc_cv.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Wrapper script for CV
+# Reads the factors file given as first argument (e.g. factors_config) and runs
+# CV_ds_pctype_prop_algo_rseed.rb once per factor line and random seed.
+# datasets_nestle.yaml must exist in the current directory.
+# Output goes to <date>_<hour>_<user>_wrapper_pc_cv.log in the current directory.
+# Andreas Maunz, David Vorgrimmler, 2012
+
+if [ $# -lt 1 ]; then
+  echo "Usage: $0 factors"
+  exit 1
+fi
+
+echo "$PWD"
+if [ ! -f "$PWD/datasets_nestle.yaml" ]
+then
+  echo "datasets_nestle.yaml does not exist."
+  exit 1
+fi
+
+# Configure basics
+source "$HOME/.bash_aliases"
+otconfig
+THIS_DATE=$(date +%Y%m%d_%H_)
+CV="CV_ds_pctype_prop_algo_rseed.rb"
+FACTORS="$1"
+
+# Don't start when running
+while ps x | grep "$CV" | grep -v grep >/dev/null 2>&1; do sleep 3; done
+
+LOGFILE="${THIS_DATE}${USER}_wrapper_pc_cv.log"
+rm "$LOGFILE" >/dev/null 2>&1
+
+# The "|| [ -n ... ]" part also processes a last line without trailing newline.
+while read -r factor || [ -n "$factor" ]; do
+  # Skip blank lines and every line that contains a '#' (comments).
+  if [[ -z "$factor" || "$factor" =~ "#" ]]; then
+    continue
+  fi
+  for r_seed in 1 #2 3 4 5
+  do
+    # Build the arguments per seed; appending to $factor would accumulate seeds.
+    cv_args="$factor $r_seed"
+    echo "${THIS_DATE}: $cv_args" >> "$LOGFILE" 2>&1
+    echo "ruby $CV $cv_args" >> "$LOGFILE" 2>&1
+    # Unquoted on purpose: $cv_args has to split into the five script arguments.
+    ruby $CV $cv_args >> "$LOGFILE" 2>&1
+    echo >> "$LOGFILE" 2>&1
+  done
+done < "$FACTORS"
+