From 20cff56d93b10962ee21577d0a25fc7b5e6907b5 Mon Sep 17 00:00:00 2001
From: davor
Date: Fri, 25 Nov 2011 08:56:25 +0100
Subject: Added scripts for automated crossvalidations

---
 5x_cv/5x_crossvalidation.rb  |  73 +++++++++++++++++++++++++
 5x_cv/comparealgs_dv.sh      |  14 +++++
 5x_cv/dataset_config         |  13 ++++
 5x_cv/exceptions_config.yaml |   6 ++
 5x_cv/factors_config         |   1 +
 5x_cv/lib/cv_am.rb           | 117 ++++++++++++++++++++++++++++++++++++++++
 5x_cv/wrapper5cv.sh          |  43 +++++++++++++++
 7 files changed, 267 insertions(+)
 create mode 100644 5x_cv/5x_crossvalidation.rb
 create mode 100755 5x_cv/comparealgs_dv.sh
 create mode 100644 5x_cv/dataset_config
 create mode 100644 5x_cv/exceptions_config.yaml
 create mode 100644 5x_cv/factors_config
 create mode 100644 5x_cv/lib/cv_am.rb
 create mode 100755 5x_cv/wrapper5cv.sh

diff --git a/5x_cv/5x_crossvalidation.rb b/5x_cv/5x_crossvalidation.rb
new file mode 100644
index 0000000..bdde2dc
--- /dev/null
+++ b/5x_cv/5x_crossvalidation.rb
@@ -0,0 +1,73 @@
+# Do a five times 10-fold crossvalidation
+# Author: Andreas Maunz, David Vorgrimmler
+# @params: a single argument of the form p1=v1;p2=v2;...;pn=vn
+#          (valid parameter names are the keys of position_mapper below)
+
+require 'rubygems'
+require 'yaml'
+require 'opentox-ruby'
+
+# Resolve helper and config relative to this script: the current directory
+# is no longer on the load path in Ruby >= 1.9.2.
+script_dir = File.expand_path(File.dirname(__FILE__))
+require File.join(script_dir, 'lib', 'cv_am.rb')
+
+if ARGV.size != 1
+  puts
+  puts "Error! Arguments: in the form p1=v1;p2=v2;...;pn=vn"
+  exit 1
+end
+
+# Positions of the named parameters in the argument array expected by cv (lib/cv_am.rb):
+# file_or_dataset_uri feature_generation min_frequency min_chisq_significance backbone stratified random_seed prediction_algorithm local_svm_kernel nr_hits conf_stdev
+position_mapper = {
+  "dataset_uri" => 0,
+  "feature_generation_uri" => 1,
+  "min_frequency" => 2,
+  "min_chisq_significance" => 3,
+  "backbone" => 4,
+  "stratified" => 5,
+  "random_seed" => 6,
+  "prediction_algorithm" => 7,
+  "local_svm_kernel" => 8,
+  "nr_hits" => 9,
+  "conf_stdev" => 10
+}
+
+# ARGV is always defined; the $ARGV alias only exists after require 'English'.
+param_str = ARGV[0]
+puts param_str
+params = Array.new(position_mapper.size, "")
+param_str.split(";").each do |param|
+  # Split on the first '=' only, so that values may contain '=' themselves.
+  k, v = param.split("=", 2)
+  unless position_mapper.key?(k)
+    puts "Error! Unknown parameter '#{k}'. Valid parameters: #{position_mapper.keys.sort.join(', ')}"
+    exit 1
+  end
+  params[position_mapper[k]] = v.to_s
+end
+params[5] = "false" # stratified
+
+# Per-dataset overrides, keyed by dataset URI.
+exception_config = YAML.load_file(File.join(script_dir, "exceptions_config.yaml")) || {}
+(exception_config[params[0]] || {}).each do |k, v|
+  unless position_mapper.key?(k)
+    puts "Error! Unknown parameter '#{k}' in exceptions_config.yaml"
+    exit 1
+  end
+  puts "Setting exception: #{k} => #{v}"
+  params[position_mapper[k]] = v.to_s
+end
+
+(1..5).each do |i|
+  begin
+    puts
+    puts "Round #{i}."
+    params[6] = i # random seed
+    cv(params)
+  rescue StandardError => e
+    # StandardError only: exit (SystemExit) and Ctrl-C (Interrupt) must end the run.
+    puts "Error in 5xCV: #{e.message}: #{e.backtrace}"
+  end
+end
diff --git a/5x_cv/comparealgs_dv.sh b/5x_cv/comparealgs_dv.sh
new file mode 100755
index 0000000..1b7a7b4
--- /dev/null
+++ b/5x_cv/comparealgs_dv.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+if [ $# -ne 4 ]; then
+  echo "\"validation_uri1,validation_uri2,...\" \"identifier1,identifier2,...\" \"significance [0.95-0.6]\" \"attributes: weighted_r_square,weighted_root_mean_squared_error,weighted_mean_absolute_error,r_square,root_mean_squared_error,sample_correlation_coefficient\""
+  exit 1
+fi
+
+uris="$1"
+iden="$2"
+signi="$3" #default 0.9; 0.95 - 0.6
+attri="$4" #weighted_r_square,weighted_root_mean_squared_error,weighted_mean_absolute_error,r_square,root_mean_squared_error,sample_correlation_coefficient
+host="toxcreate3.in-silico.ch:8080"
+
+curl -X POST -d "validation_uris=$uris" -d "identifier=$iden" -d "ttest_significance=$signi" -d "ttest_attributes=$attri" "http://$host/validation/report/algorithm_comparison"
diff --git a/5x_cv/dataset_config b/5x_cv/dataset_config
new file mode 100644
index 0000000..a52649c
--- /dev/null
+++ b/5x_cv/dataset_config
@@ -0,0 +1,13 @@
+#EPA v4b Fathead Minnow Acute Toxicity LC50_mmol
+http://toxcreate3.in-silico.ch:8080/dataset/2133
+#CPDBAS_v5d_20Nov2008_rat_TD50
+#http://toxcreate3.in-silico.ch:8080/dataset/1408
+#CPDBAS_v5d_20Nov2008_mouse_TD50
+#http://toxcreate3.in-silico.ch:8080/dataset/1384
+#MultiCellCall: DSSTox Carcinogenic Potency DBS MultiCellCall_no_duplicates.csv
+#http://toxcreate3.in-silico.ch:8080/dataset/130
+#Bloodbarr: bloodbarr_no_duplicate.csv
+#http://toxcreate3.in-silico.ch:8080/dataset/271
+#Salmonella Mutagenicity: DSSTox Carcinogenic Potency DBS Mutagenicity_no_duplicates.csv
+#http://toxcreate3.in-silico.ch:8080/dataset/233
+
diff --git a/5x_cv/exceptions_config.yaml b/5x_cv/exceptions_config.yaml
new file mode 100644
index 0000000..7124c62
--- /dev/null
+++ b/5x_cv/exceptions_config.yaml
@@ -0,0 +1,6 @@
+http://toxcreate3.in-silico.ch:8080/dataset/271:
+  min_frequency: 12
+http://x61s.fdm.uni-freiburg.de/dataset/3546:
+  min_frequency: 8
+http://x61s.fdm.uni-freiburg.de/dataset/3543:
+  min_frequency: 6
diff --git a/5x_cv/factors_config b/5x_cv/factors_config
new file mode 100644
index 0000000..72dbb5f
--- /dev/null
+++ b/5x_cv/factors_config
@@ -0,0 +1 @@
+feature_generation_uri=http://toxcreate3.in-silico.ch:8080/algorithm/fminer/bbrc
diff --git a/5x_cv/lib/cv_am.rb b/5x_cv/lib/cv_am.rb
new file mode 100644
index 0000000..965cd5b
--- /dev/null
+++ b/5x_cv/lib/cv_am.rb
@@ -0,0 +1,117 @@
+# Do a 10-fold crossvalidation with multiple datasets
+# Author: Andreas Maunz, David Vorgrimmler
+
+# Prints +message+ and terminates the script with exit status 1.
+def cv_abort(message)
+  puts message
+  exit 1
+end
+
+# Checks the positional arguments of cv and exits with status 1 on the first
+# invalid one. Optional arguments may be "" (meaning "not set").
+#
+# The optional checks used to be guarded by `if ! args[n] == ""`, which Ruby
+# parses as `(!args[n]) == ""` and which is therefore never true, so they
+# never ran. NOTE(review): they are enforced now exactly as their messages
+# state; widen the accepted values here if other settings are legitimate.
+def cv_validate_args(args)
+  if args.size != 11
+    puts
+    cv_abort "Error! Arguments: file_or_dataset_uri feature_generation min_frequency min_chisq_significance backbone stratified random_seed prediction_algorithm local_svm_kernel nr_hits conf_stdev"
+  end
+
+  a = args.map { |arg| arg.to_s }
+
+  unless a[1].include?("/algorithm/fminer/bbrc") || a[1].include?("/algorithm/fminer/last")
+    cv_abort "feature_generation_uri must contain '/algorithm/fminer/last' or '/algorithm/fminer/bbrc'"
+  end
+
+  if a[2] != "" && a[2].to_i < 2
+    cv_abort "min_frequency must be at least 2 or \"\""
+  end
+
+  if a[3] != "" && !(a[3].to_f >= 0.0 && a[3].to_f <= 1.0)
+    cv_abort "min_chisq_significance must be between 0 and 1 or \"\""
+  end
+
+  if a[4] != "" && a[4] != "true" && a[4] != "false"
+    cv_abort "backbone must be 'true' or 'false'."
+  end
+
+  if a[5] != "true" && a[5] != "false"
+    cv_abort "stratified must be 'true' or 'false'"
+  end
+
+  # The old test, `args[6].to_i <= 1`, would have rejected every seed above 1.
+  if a[6] != "" && a[6] !~ /\A\d+\z/
+    cv_abort "random_seed must be a natural number or \"\""
+  end
+
+  if a[7] != "" && a[7] != "local_svm_classification"
+    cv_abort "lazar_prediction_method must be \"local_svm_classification\""
+  end
+
+  if a[8] != "" && a[8] != "weighted_tanimoto" && a[8] != "propositionalized"
+    cv_abort "local_svm_kernel must be \"weighted_tanimoto\" or \"propositionalized\""
+  end
+
+  if a[9] != "" && a[9] != "true"
+    cv_abort "nr_hits must be \"true\""
+  end
+
+  if a[10] != "" && a[10] != "true"
+    cv_abort "conf_stdev must be \"true\""
+  end
+end
+
+# Runs one lazar crossvalidation on a dataset and prints the URIs of the
+# crossvalidation and of its report. Failures of the validation service are
+# printed, not raised.
+#
+# @param args [Array] eleven positional values: file_or_dataset_uri,
+#   feature_generation, min_frequency, min_chisq_significance, backbone,
+#   stratified, random_seed, prediction_algorithm, local_svm_kernel, nr_hits,
+#   conf_stdev; optional ones may be ""
+def cv(args)
+  subjectid = nil # OpenTox::Authorization.authenticate(guest,guest)
+
+  cv_validate_args(args)
+
+  training_dataset_uri = args[0]
+  puts training_dataset_uri
+  prediction_feature = OpenTox::Dataset.find(training_dataset_uri).features.keys.first
+  puts prediction_feature
+  puts training_dataset_uri
+
+  # Optional algorithm parameters and their positions in args.
+  # Array of pairs rather than a Hash: Ruby 1.8 hashes do not keep insertion order.
+  optional_alg_params = [
+    ["min_frequency", 2], ["min_chisq_significance", 3], ["backbone", 4],
+    ["prediction_algorithm", 7], ["local_svm_kernel", 8], ["nr_hits", 9], ["conf_stdev", 10]
+  ]
+  alg_params = "feature_generation_uri=#{args[1]}"
+  optional_alg_params.each do |name, position|
+    alg_params << ";#{name}=#{args[position]}" unless args[position].to_s == ""
+  end
+
+  # Crossvalidation
+  # @param [Hash] params (required:algorithm_uri,dataset_uri,prediction_feature, optional:algorithm_params,num_folds(10),random_seed(1),stratified(false))
+  cv_args = {
+    :dataset_uri => training_dataset_uri,
+    :prediction_feature => prediction_feature,
+    :algorithm_uri => args[1].split('fminer')[0] + "lazar",
+    :algorithm_params => alg_params,
+    :stratified => args[5]
+  }
+  cv_args[:random_seed] = args[6] unless args[6].to_s == ""
+  puts training_dataset_uri
+  puts cv_args.to_yaml
+  puts
+  begin
+    cv_uri = OpenTox::Crossvalidation.create(cv_args).uri
+    puts cv_uri
+    report_uri = OpenTox::CrossvalidationReport.create(cv_uri, subjectid).uri
+    puts report_uri
+  rescue StandardError => e
+    puts "cv failed: #{e.message} #{e.backtrace}"
+  end
+end
diff --git a/5x_cv/wrapper5cv.sh b/5x_cv/wrapper5cv.sh
new file mode 100755
index 0000000..2155635
--- /dev/null
+++ b/5x_cv/wrapper5cv.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Wrapper script for CV
+# Set Factors, Datasets, Exceptions in the respective config_files
+# AM, 2011
+
+if [ $# -lt 2 ]; then
+  echo "Usage: $0 factors datasets"
+  exit 1
+fi
+
+# Configure basics
+source "$HOME/.bash_aliases"
+otconfig
+THIS_DATE=$(date +%Y%m%d_%H_)
+FACTORS="$1"
+DATASETS="$2"
+
+# Don't start while one of this user's crossvalidation runs is still active.
+# (The former `ps x | grep 5x` matched any command line containing "5x".)
+while pgrep -u "$(id -u)" -f 'ruby .*5x_crossvalidation\.rb' >/dev/null 2>&1; do sleep 3; done
+
+LOGFILE="${THIS_DATE}${USER}_wrapper5cv.log"
+rm -f "$LOGFILE"
+
+# Read via redirection instead of `cat | while`; `|| [ -n ... ]` keeps a last line without newline.
+while read -r dataset_uri || [ -n "$dataset_uri" ]; do
+  [ -z "$dataset_uri" ] && continue # skip blank lines instead of validating an empty URI
+  if ! [[ "$dataset_uri" =~ "#" ]]; then # allow comments
+    while read -r factor || [ -n "$factor" ]; do
+      [ -z "$factor" ] && continue # skip blank lines
+      if ! [[ "$factor" =~ "#" ]]; then # allow comments
+        echo "${THIS_DATE}: $factor" >> "$LOGFILE" 2>&1
+        factor="$factor;dataset_uri=$dataset_uri"
+        echo "ruby 5x_crossvalidation.rb $factor" >> "$LOGFILE" 2>&1
+        # stdin from /dev/null so the command cannot consume the factors file
+        ruby 5x_crossvalidation.rb "$factor" < /dev/null >> "$LOGFILE" 2>&1
+      fi
+    done < "$FACTORS"
+  else
+    echo >> "$LOGFILE" 2>&1
+    echo "$dataset_uri" >> "$LOGFILE" 2>&1
+  fi
+done < "$DATASETS"
-- 
cgit v1.2.3