diff options
author | Andreas Maunz <andreas@maunz.de> | 2012-01-17 10:10:33 +0100 |
---|---|---|
committer | Andreas Maunz <andreas@maunz.de> | 2012-01-17 10:10:33 +0100 |
commit | 6758b671b7a6efbd7be8bc4fc88e705fb0ee95a0 (patch) | |
tree | a762fdcd9fc62f98663a1d005122bfa5b54ad64d | |
parent | 2df4866db38f64844166e4eaddf956ff59ca46f7 (diff) | |
parent | 627105104a86174794b4edc2f93176bff873ba2b (diff) |
Merge branch 'pc_new_1' of github.com:opentox/test into pc_new_1
-rw-r--r-- | 5x_cv/CV_ds_pctype_prop_algo_rseed.rb | 60 | ||||
-rw-r--r-- | 5x_cv/check_datasets_inchi_equality.rb | 98 | ||||
-rw-r--r-- | 5x_cv/datasets_nestle_outl.yaml | 67 | ||||
-rw-r--r-- | 5x_cv/factors_config | 38 | ||||
-rwxr-xr-x | 5x_cv/wrapper_pc_cv.sh | 45 |
5 files changed, 308 insertions, 0 deletions
# diff --git a/5x_cv/CV_ds_pctype_prop_algo_rseed.rb b/5x_cv/CV_ds_pctype_prop_algo_rseed.rb (new file, mode 100644)
#
# Run one crossvalidation of the lazar algorithm through the OpenTox services
# and create a crossvalidation report for it. Prints the algorithm parameters,
# the crossvalidation arguments, the crossvalidation URI and the report URI.
#
# NOTE(review): the original header called this a "10-fold" crossvalidation,
# but no fold count is passed to the service below -- confirm which default
# the crossvalidation service applies.
#
# Author: Andreas Maunz, David Vorgrimmler
#
# Usage: ruby CV_ds_pctype_prop_algo_rseed.rb ds_name pc_type prop algo random_seed
#   ds_name      top-level key of datasets_nestle.yaml, e.g. MOU
#   pc_type      key below ds_name that names the feature dataset,
#                e.g. electronic,cpsa or constitutional
#   prop         true or false (sent as "propositionalized")
#   algo         prediction algorithm, e.g. local_svm_regression, local_mlr_prop
#   random_seed  e.g. 1, 2, ..., 10
#
# Exits with status 1 on wrong usage, a missing datasets file, or an unknown
# ds_name / pc_type.

if ARGV.size != 5
  puts "Args: ds_name, pc_type, prop, algo, random_seed"
  puts ARGV.size
  exit 1
end

# The datasets file is looked up in the current working directory.
ds_file = "datasets_nestle.yaml"
path = File.join(Dir.pwd, ds_file)
if File.exist?(path)
  puts "#{ds_file} exists"
else
  puts "#{ds_file} does not exist."
  exit 1
end

# Loaded only after the cheap checks above have passed.
require 'rubygems'
require 'opentox-ruby'
require 'yaml'

subjectid = nil

ds_name, pc_type, prop, algo, r_seed = ARGV

# An empty YAML file loads as false/nil; treat it like an empty mapping.
ds = YAML.load_file(path) || {}
entry = ds[ds_name]
unless entry
  puts "Dataset '#{ds_name}' not found in #{ds_file}."
  exit 1
end

ds_uri = entry["dataset"]
pc_ds_uri = entry[pc_type]
unless ds_uri && pc_ds_uri
  # Without this check a missing pc_type would be sent to the service as an
  # empty feature_dataset_uri.
  puts "No 'dataset' or '#{pc_type}' entry for '#{ds_name}' in #{ds_file}."
  exit 1
end

# Parameters are handed to the algorithm as one ';'-separated string.
algo_params = [
  "pc_type=#{pc_type}",
  "feature_dataset_uri=#{pc_ds_uri}",
  "propositionalized=#{prop}",
  "prediction_algorithm=#{algo}"
].join(";")
puts algo_params.to_yaml

# The first feature of the training dataset is used as prediction feature.
prediction_feature = OpenTox::Dataset.find(ds_uri).features.keys.first

# Ready
cv_args = {
  :dataset_uri => ds_uri,
  :prediction_feature => prediction_feature,
  :algorithm_uri => "http://toxcreate3.in-silico.ch:8087/algorithm/lazar",
  :algorithm_params => algo_params,
  :stratified => false,
  :random_seed => r_seed
}
puts cv_args.to_yaml

cv = OpenTox::Crossvalidation.create(cv_args).uri
puts cv

cvr = OpenTox::CrossvalidationReport.create(cv, subjectid).uri
puts cvr

# diff --git a/5x_cv/check_datasets_inchi_equality.rb b/5x_cv/check_datasets_inchi_equality.rb new
file mode 100644 index 0000000..7d38e96 --- /dev/null +++ b/5x_cv/check_datasets_inchi_equality.rb @@ -0,0 +1,98 @@ +require 'rubygems' +require 'opentox-ruby' +require 'yaml' + +@subjectid = nil + + + +def check_ds(t_ds_uri, f_ds_uri) + puts t_ds_uri + puts f_ds_uri + + regression_training_dataset = OpenTox::Dataset.find(t_ds_uri, @subjectid)#3963;1572; + regression_feature_dataset = OpenTox::Dataset.find(f_ds_uri, @subjectid)#3971;3946; + + train_ds = regression_training_dataset.data_entries.keys + train_cmds = regression_training_dataset.compounds + feature_ds = regression_feature_dataset.data_entries.keys + feature_cmds = regression_feature_dataset.compounds + + puts "----- Check activity inchi -----" + match=0 + mismatch=0 + train_ds.each { |i| + if feature_ds.include?(i) + match = match + 1 + else + mismatch = mismatch + 1 + end + } + if mismatch > 0 + puts "NOT all training compounds represented in feature dataset!!!" unless mismatch > 0 + puts "match: #{match}; and mismatch: !!!!!#{mismatch}!!!!!" + else + puts "All training compounds represented in feature dataset." unless mismatch > 0 + puts "match: #{match}; mismatch: #{mismatch}" + puts "OK!!!" + end + + train_ds.sort! + feature_ds.sort! + + if train_ds == feature_ds + puts "train_ds == feature_ds" + else + a = train_ds - feature_ds + #puts "d: '#{a}'" + puts "train_ds: " + train_ds.size.to_s + "; feature_ds: "+ feature_ds.size.to_s + puts "train_ds =NOT feature_ds" + end + + + + puts "----- Check compound inchi -----" + match=0 + mismatch=0 + train_cmds.each { |i| + if feature_cmds.include?(i) + match = match + 1 + else + mismatch = mismatch + 1 + end + } + if mismatch > 0 + puts "NOT all training compounds represented in feature dataset!!!" unless mismatch > 0 + puts "match: #{match}; and mismatch: !!!!!#{mismatch}!!!!!" + else + puts "All training compounds represented in feature dataset." unless mismatch > 0 + puts "match: #{match}; mismatch: #{mismatch}" + puts "OK!!!" 
+ end + + + + feature_cmds.sort! + train_cmds.sort! + + if train_cmds == feature_cmds + puts "train_cmds == feature_cmds" + else + b = train_cmds - feature_cmds + #puts "d: '#{b}'" + puts "train_cmds: " + train_cmds.size.to_s + "; feature_cmds: " + feature_cmds.size.to_s + puts "train_cmds =NOT feature_cmds" + end + puts +end + + + + +ds = YAML::load_file("datasets_nestle.yaml") +ds.keys.each { |dataset| + ds[dataset].keys.each { |pc| + puts pc + check_ds(ds[dataset]["dataset"], ds[dataset][pc]) + } +} diff --git a/5x_cv/datasets_nestle_outl.yaml b/5x_cv/datasets_nestle_outl.yaml new file mode 100644 index 0000000..5ef4ea1 --- /dev/null +++ b/5x_cv/datasets_nestle_outl.yaml @@ -0,0 +1,67 @@ +{ + + "MDD": { + "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/1571", + "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/4007", + "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/4006", +# "geometrical": "http://toxcreate3.in-silico.ch:8086/dataset/1905", +# "topological": "http://toxcreate3.in-silico.ch:8086/dataset/1906", +# "hybrid": "http://toxcreate3.in-silico.ch:8086/dataset/1907", + "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5265" + }, + + "FHM": { + "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/1572", + "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/4019", + "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/4008" #, +# "geometrical": "http://toxcreate3.in-silico.ch:8086/dataset/1908", +# "topological": "http://toxcreate3.in-silico.ch:8086/dataset/1909", +# "hybrid": "http://toxcreate3.in-silico.ch:8086/dataset/1910" + }, + + "RAT": { + "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/1573", + "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/4077", + "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/4020" #, +# "geometrical": "http://toxcreate3.in-silico.ch:8086/dataset/1911", +# "topological": 
"http://toxcreate3.in-silico.ch:8086/dataset/1912", +# "hybrid": "http://toxcreate3.in-silico.ch:8086/dataset/1913" + }, + + "MOU": { + "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/1574", + "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/4115", + "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/4101" #, +# "geometrical": "http://toxcreate3.in-silico.ch:8086/dataset/1914", +# "topological": "http://toxcreate3.in-silico.ch:8086/dataset/1915", +# "hybrid": "http://toxcreate3.in-silico.ch:8086/dataset/1916" + }, + +# "MDD2": { +# "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/", +# "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/", +# "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/", +# "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/" +# }, +# + "FHM2": { + "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/4975", + "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5263", + "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/5262", + "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5266" + }, + + "RAT2": { + "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/4977", + "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5253", + "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/5246", + "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5254" + }, + + "MOU2": { + "dataset": "http://toxcreate3.in-silico.ch:8086/dataset/4976", + "electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5267", + "constitutional": "http://toxcreate3.in-silico.ch:8086/dataset/5264", + "constitutional,electronic,cpsa": "http://toxcreate3.in-silico.ch:8086/dataset/5198" + } +} diff --git a/5x_cv/factors_config b/5x_cv/factors_config new file mode 100644 index 0000000..604ca45 --- /dev/null +++ b/5x_cv/factors_config @@ -0,0 +1,38 @@ +#Dataset pc_type prop 
prediction_algorithm +MDD constitutional false local_svm_regression +MDD constitutional true local_svm_regression +MDD electronic,cpsa false local_svm_regression +MDD electronic,cpsa true local_svm_regression +MDD constitutional,electronic,cpsa false local_svm_regression +MDD constitutional,electronic,cpsa true local_svm_regression +#MDD constitutional true local_mlr_prop +#MDD electronic,cpsa true local_mlr_prop +#MDD constitutional,electronic,cpsa true local_mlr_prop +FHM2 constitutional false local_svm_regression +FHM2 constitutional true local_svm_regression +FHM2 electronic,cpsa false local_svm_regression +FHM2 electronic,cpsa true local_svm_regression +FHM2 constitutional,electronic,cpsa false local_svm_regression +FHM2 constitutional,electronic,cpsa true local_svm_regression +#MOU constitutional true local_mlr_prop +#MOU electronic,cpsa true local_mlr_prop +#MOU constitutional,electronic,cpsa true local_mlr_prop +RAT2 constitutional false local_svm_regression +RAT2 constitutional true local_svm_regression +RAT2 electronic,cpsa false local_svm_regression +RAT2 electronic,cpsa true local_svm_regression +RAT2 constitutional,electronic,cpsa false local_svm_regression +RAT2 constitutional,electronic,cpsa true local_svm_regression +#MOU constitutional true local_mlr_prop +#MOU electronic,cpsa true local_mlr_prop +#MOU constitutional,electronic,cpsa true local_mlr_prop +MOU2 constitutional false local_svm_regression +MOU2 constitutional true local_svm_regression +MOU2 electronic,cpsa false local_svm_regression +MOU2 electronic,cpsa true local_svm_regression +MOU2 constitutional,electronic,cpsa false local_svm_regression +MOU2 constitutional,electronic,cpsa true local_svm_regression +#MOU constitutional true local_mlr_prop +#MOU electronic,cpsa true local_mlr_prop +#MOU constitutional,electronic,cpsa true local_mlr_prop + diff --git a/5x_cv/wrapper_pc_cv.sh b/5x_cv/wrapper_pc_cv.sh new file mode 100755 index 0000000..719885a --- /dev/null +++ 
b/5x_cv/wrapper_pc_cv.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Wrapper Skript for CV +# Reads factors_config, dataset_nestle.yaml and performs cv's +# Andreas Maunz, David Vorgrimmler, 2012 + +if [ $# -lt 1 ]; then + echo "Usage: $0 factors" + exit +fi + +PWD=`pwd` +echo $PWD +if [ ! -f $PWD/datasets_nestle.yaml ] +then + echo "datasets_nestle.yaml does not exist." + exit +fi + +# Configure basics +source $HOME/.bash_aliases +otconfig +THIS_DATE=`date +%Y%m%d_%H_` +CV="CV_ds_pctype_prop_algo_rseed.rb" +FACTORS="$1" + +# Don't start when running +while ps x | grep $CV | grep -v grep >/dev/null 2>&1; do sleep 3; done + +LOGFILE="$THIS_DATE""$USER""_wrapper_pc_cv.log" +rm "$LOGFILE" >/dev/null 2>&1 + + +cat $FACTORS | while read factor; do + if ! [[ "$factor" =~ "#" ]]; then # allow comments + for r_seed in 1 #2 3 4 5 + do + factor="$factor $r_seed" + echo "${THIS_DATE}: $factor" >> $LOGFILE>&1 + echo "ruby $CV $factor" >> $LOGFILE 2>&1 + ruby $CV $factor >> $LOGFILE 2>&1 + echo >> $LOGFILE 2>&1 + done + fi +done + |