From 45e48d03d0f294f10e4b5eec3fb519dcb2851624 Mon Sep 17 00:00:00 2001
From: davor
Date: Tue, 10 Jul 2012 16:28:12 +0200
Subject: Updated code. Note: Code moved to https://github.com/davor/bbrc-sample-client

---
 bbrc-sample/bbrc_sample_dv.rb | 142 +++++++++++++++++++++++++++++++++++++-----
 bbrc-sample/factors_config_dv |  62 +++++++++---------
 2 files changed, 157 insertions(+), 47 deletions(-)

diff --git a/bbrc-sample/bbrc_sample_dv.rb b/bbrc-sample/bbrc_sample_dv.rb
index e3c268d..6bb1ddd 100644
--- a/bbrc-sample/bbrc_sample_dv.rb
+++ b/bbrc-sample/bbrc_sample_dv.rb
@@ -3,22 +3,83 @@ require 'rubygems'
 require 'opentox-ruby'
 require 'yaml'
+require 'csv'
 
-if ARGV.size != 9
-  puts "Args: path/to/dataset.yaml ds_name num_boots backbone min_frequency method find_min_frequency start_seed end_seed"
+def check_params(args, dataset_names)
+  if ! (dataset_names.include? args[1])
+    puts "dataset name has to exist in dataset.yaml"
+    exit 1
+  end
+
+  if args[2].to_i <= 2
+    puts "num_boots must be a natural number greater than 2 (higher than 30 recommended)"
+    exit 1
+  end
+
+  if args[3].to_s != "true" && args[3].to_s != "false"
+    puts "backbone must be 'true' or 'false'."
+    exit 1
+  end
+
+  if args[4].gsub(/[pmc]/, '').to_i <= 0
+    puts "min_frequency must be a natural number X, optionally written as Xpm or Xpc"
+    exit 1
+  end
+
+  if ! (['bbrc', 'mean', 'mle'].include? args[5])
+    puts "method must be 'bbrc', 'mean' or 'mle'"
+    exit 1
+  end
+
+  if args[6].to_s != "true" && args[6].to_s != "false"
+    puts "find_min_frequency must be 'true' or 'false'"
+    exit 1
+  end
+
+  if args[7].to_i < 1
+    puts "start_seed must be a natural number"
+    exit 1
+  end
+
+  if args[8].to_i < 1
+    puts "end_seed must be a natural number"
+    exit 1
+  end
+
+  if args[7].to_i > args[8].to_i
+    puts "start_seed must not be greater than end_seed"
+    exit 1
+  end
+
+  if ! (args[9].to_f <= 0.9 && args[9].to_f >= 0.1)
+    puts "split_ratio must be between 0.1 and 0.9"
+    exit 1
+  end
+
+  if ! (args[10].to_f <= 0.1 && args[10].to_f >= 0.0005)
+    puts "time_per_cmpd must be between 0.0005 and 0.1"
+    exit 1
+  end
+end
+
+if ARGV.size != 11
+  puts "Args: path/to/dataset.yaml ds_name num_boots backbone min_frequency method find_min_frequency start_seed end_seed split_ratio time_per_cmpd"
   puts ARGV.size
   exit
 end
 
 path = ARGV[0]
 ds_file = path.split("/").last
-
 if File.exists?(path)
   puts "[#{Time.now.iso8601(4).to_s}] #{ds_file} exists."
 else
   puts "#{ds_file} does not exist."
   exit
 end
+ds = YAML::load_file("#{path}")
+ds_names = ds.keys
+
+check_params(ARGV, ds_names)
 
 subjectid = nil
 
@@ -26,19 +87,17 @@ ds_name = ARGV[1] # e.g. MOU,RAT
 num_boots = ARGV[2] # integer, 100 recommended
 backbone = ARGV[3] # true/false
 min_freq = ARGV[4] # integer
-method = ARGV[5] # mle, mean, bbrc
+method = ARGV[5] # mle, mean or bbrc
 find_min_frequency = ARGV[6] # true/false
-start_seed = ARGV[7] # integer (< end_seed)
-end_seed = ARGV[8] #integer (> start_seed)
+start_seed = ARGV[7].to_i # integer (<= end_seed)
+end_seed = ARGV[8].to_i #integer (>= start_seed)
+split_ratio = ARGV[9].to_f # float, default 0.5 (>=0.1 and <=0.9)
+time_per_cmpd = ARGV[10].to_f # float, 0.003 (seconds) recommended; this is only an empirical value.
 
 hits = false
 
-if start_seed > end_seed
-  puts "Start_seed has to be smaller than end_seed. "
-end
-
-ds = YAML::load_file("#{path}")
 ds_uri = ds[ds_name]["dataset"]
+finished_rounds = 0
 result1 = []
 result2 = []
 metadata = []
@@ -54,10 +113,14 @@ statistics[:merge_time] = []
 statistics[:n_stripped_mss] = []
 statistics[:n_stripped_cst] = []
 
+$stdout.flush
+
 begin
   for i in start_seed..end_seed
     puts
     puts "--------------------------- Round: #{i} ---------------------------"
+    $stdout.flush
+    del_ds = []
 
     #################################
     # SPLIT
@@ -67,7 +130,7 @@ begin
     split_params["dataset_uri"] = ds_uri
     split_params["prediction_feature"] = (ds_uri.to_s + "/feature/1")
     split_params["stratified"] = true
-    split_params["split_ratio"] = 0.5
+    split_params["split_ratio"] = split_ratio
     split_params["random_seed"] = i
     puts "[#{Time.now.iso8601(4).to_s}] Split params: #{split_params.to_yaml}"
 
@@ -75,8 +138,10 @@ begin
     datasets = {}
     datasets[:training_ds] = split_result.inspect.gsub(/"/,'').split("\\n")[0]
     datasets[:test_ds] = split_result.inspect.gsub(/"/,'').split("\\n")[1]
+    del_ds = del_ds + datasets.values
     puts "[#{Time.now.iso8601(4).to_s}] Split result: #{datasets.to_yaml}"
     puts
+    $stdout.flush
 
     #################################
     # FIND "good" min_frequency
@@ -106,12 +171,13 @@ begin
        result_uri = OpenTox::RestClientWrapper.post( File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc/"), min_params )
        durations << Time.now - t
        ds_result = OpenTox::Dataset.find(result_uri)
+       del_ds << ds_result.uri
        ds_result_nr_f = ds_result.features.size
      end
 
      # puts
      # puts "----- Main phase: -----"
-     max_duration = durations[0] +(ds_nr_com.to_f * 0.003) # this is only an experience value.
+     max_duration = durations[0] +(ds_nr_com.to_f * time_per_cmpd)
      min_params["min_frequency"] = y
      y = y_old
      found = false
@@ -127,6 +193,7 @@ begin
        result_uri = OpenTox::RestClientWrapper.post( File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc/"), min_params )
        durations << Time.now - t
        ds_result = OpenTox::Dataset.find(result_uri)
+       del_ds << ds_result.uri
        ds_result_nr_f = ds_result.features.size
        # Check if number of features is max half and min one-tenth of the number of compounds and performed in accaptable amount of time
        if ds_result_nr_f.to_i < (ds_nr_com/2).to_i && ds_result_nr_f.to_i > (ds_nr_com/10).to_i
@@ -169,6 +236,7 @@ begin
     puts "[#{Time.now.iso8601(4).to_s}] BBRC duration: #{bbrc_duration}"
     puts "[#{Time.now.iso8601(4).to_s}] BBRC result: #{feature_dataset_uri}"
     puts
+    $stdout.flush
 
     #################################
     # MATCH
@@ -184,12 +252,15 @@ begin
     matched_dataset_uri = OpenTox::RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"],"fminer","bbrc","match"),match_params)
     puts "[#{Time.now.iso8601(4).to_s}] BBRC match result: #{matched_dataset_uri}"
     puts
+    $stdout.flush
 
     #################################
     # COMPARE pValues
     #################################
     puts " ----- pValue comparision -----"
     bbrc_ds = OpenTox::Dataset.find(feature_dataset_uri)
+    bbrc_ds.save(subjectid)
+    del_ds << bbrc_ds.uri
     bbrc_smarts_pValues = {}
     bbrc_ds.features.each do |f, values|
       if values[RDF::type].include?(OT.Substructure)
@@ -209,13 +280,16 @@ begin
     sum_E2 = 0.0
     bbrc_smarts_pValues.each do |s, p|
       if matched_smarts_pValues.include?(s)
-        dif = (p.to_f - matched_smarts_pValues[s].to_f).abs
+        dif = (p.to_f - matched_smarts_pValues[s].to_f)
         sum_E1 = sum_E1 + dif
-        sum_E2 = sum_E2 + dif**2
+        sum_E2 = sum_E2 + dif.abs
       end
-    end
+    end
+    sum_E1 = sum_E1/bbrc_smarts_pValues.size
+    sum_E2 = sum_E2/bbrc_smarts_pValues.size
     puts "[#{Time.now.iso8601(4).to_s}] Sum pValue difference (E1): #{sum_E1}"
     puts "[#{Time.now.iso8601(4).to_s}] Squared sum pValue difference (E2): #{sum_E2}"
+    $stdout.flush
 
     #################################
     # SAVE data
@@ -248,8 +322,28 @@ begin
     metadata << info
     puts
 
+    finished_rounds += 1
+    del_ds.each do |del_ds_uri|
+      ds = OpenTox::Dataset.find(del_ds_uri, subjectid)
+    end
+    $stdout.flush
+  end
+
+  #################################
+  # Create CSV result
+  #################################
+  csv_file_name = "bbrc_sample_#{ds_name}_#{method}_#{start_seed}_#{(start_seed + finished_rounds)-1}_results.csv"
+  if File.exists?(csv_file_name)
+    csv_file_name = csv_file_name + Time.now.usec.to_s
   end
 
+  CSV.open(csv_file_name, 'w') do |writer|
+    writer << ['E1', 'E2']
+    for i in 0..(result1.size-1)
+      writer << [result1[i], result2[i]]
+    end
+  end
+
   min_sampling_support = (statistics[:min_sampling_support].inject{|sum,x| sum + x })/(statistics[:min_sampling_support].size) unless statistics[:min_sampling_support].compact.empty?
   min_frequency_per_sample = (statistics[:min_frequency_per_sample].inject{|sum,x| sum + x })/(statistics[:min_frequency_per_sample].size) unless statistics[:min_frequency_per_sample].compact.empty?
   bbrc_ds_nr_com = (statistics[:bbrc_ds_nr_com].inject{|sum,x| sum + x })/(statistics[:bbrc_ds_nr_com].size) unless statistics[:bbrc_ds_nr_com].compact.empty?
@@ -285,6 +379,21 @@ rescue Exception => e
   LOGGER.debug "#{e.class}: #{e.message}"
   LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
 
+  #################################
+  # Create CSV result
+  #################################
+  csv_file_name = "bbrc_sample_#{ds_name}_#{method}_#{start_seed}_#{start_seed + finished_rounds}_results.csv"
+  if File.exists?(csv_file_name)
+    csv_file_name = csv_file_name + Time.now.usec.to_s
+  end
+
+  CSV.open(csv_file_name, 'w') do |writer|
+    writer << ['E1', 'E2']
+    for i in 0..(result1.size-1)
+      writer << [result1[i], result2[i]]
+    end
+  end
+
   min_sampling_support = (statistics[:min_sampling_support].inject{|sum,x| sum + x })/(statistics[:min_sampling_support].size) unless statistics[:min_sampling_support].compact.empty?
   min_frequency_per_sample = (statistics[:min_frequency_per_sample].inject{|sum,x| sum + x })/(statistics[:min_frequency_per_sample].size) unless statistics[:min_frequency_per_sample].compact.empty?
   bbrc_ds_nr_com = (statistics[:bbrc_ds_nr_com].inject{|sum,x| sum + x })/(statistics[:bbrc_ds_nr_com].size) unless statistics[:bbrc_ds_nr_com].compact.empty?
@@ -316,3 +425,4 @@ rescue Exception => e
   puts "[#{Time.now.iso8601(4).to_s}] result2: #{result2.to_yaml}"
 
 end
+
diff --git a/bbrc-sample/factors_config_dv b/bbrc-sample/factors_config_dv
index 127c34b..d3e586d 100644
--- a/bbrc-sample/factors_config_dv
+++ b/bbrc-sample/factors_config_dv
@@ -1,31 +1,31 @@
-#Dataset num_boots backbone min_frequence method find_min_frequency start_seed end_seed
-#MOU 100 true 20pm mle true 1 50
-#MOU 100 true 20pm mean true 1 50
-#MOU 100 true 20pm bbrc true 1 50
-#MOU 100 false 30pm mle true 1 50
-#MOU 100 false 30pm mean true 1 50
-#MOU 100 false 30pm bbrc true 1 50
-#RAT 100 true 20pm mle true 1 50
-#RAT 100 true 20pm mean true 1 50
-#RAT 100 true 20pm bbrc true 1 50
-#RAT 100 false 40pm mle true 1 50
-#RAT 100 false 40pm mean true 1 50
-#RAT 100 false 40pm bbrc true 1 50
-#MCC 100 true 20pm mle true 1 50
-#MCC 100 true 20pm mean true 1 50
-#MCC 100 true 20pm bbrc true 1 50
-#MCC 100 false 50pm mle true 1 50
-#MCC 100 false 50pm mean true 1 50
-#MCC 100 false 50pm bbrc true 1 50
-#SAL 100 true 40pm mle true 1 50
-#SAL 100 true 40pm mean true 1 50
-#SAL 100 true 40pm bbrc true 1 50
-#SAL 100 false 70pm mle true 1 50
-#SAL 100 false 70pm mean true 1 50
-#SAL 100 false 70pm bbrc true 1 50
-#KAZ 100 true 30pm mle true 1 50
-#KAZ 100 true 30pm mean true 1 50
-#KAZ 100 true 30pm bbrc true 1 50
-#KAZ 100 false 90pm mle true 1 50
-#KAZ 100 false 90pm mean true 1 50
-#KAZ 100 false 90pm bbrc true 1 50
+#Dataset num_boots backbone min_frequence method find_min_frequency start_seed end_seed split_ratio time_per_cmpd
+#MOU 50 true 20pm mle true 1 100 0.5 0.003
+#MOU 50 true 20pm mean true 1 100 0.5 0.003
+#MOU 50 true 20pm bbrc true 1 100 0.5 0.003
+#MOU 50 false 30pm mle true 1 100 0.5 0.003
+#MOU 50 false 30pm mean true 1 100 0.5 0.003
+#MOU 50 false 30pm bbrc true 1 100 0.5 0.003
+#RAT 50 true 20pm mle true 1 100 0.5 0.003
+#RAT 50 true 20pm mean true 1 100 0.5 0.003
+#RAT 50 true 20pm bbrc true 1 100 0.5 0.003
+#RAT 50 false 40pm mle true 1 100 0.5 0.003
+#RAT 50 false 40pm mean true 1 100 0.5 0.003
+#RAT 50 false 40pm bbrc true 1 100 0.5 0.003
+#MCC 50 true 20pm mle true 1 100 0.5 0.003
+#MCC 50 true 20pm mean true 1 100 0.5 0.003
+#MCC 50 true 20pm bbrc true 1 100 0.5 0.003
+#MCC 50 false 100pm mle true 1 100 0.5 0.003
+#MCC 50 false 100pm mean true 1 100 0.5 0.003
+#MCC 50 false 100pm bbrc true 1 100 0.5 0.003
+#SAL 50 true 40pm mle true 1 100 0.5 0.003
+#SAL 50 true 40pm mean true 1 100 0.5 0.003
+#SAL 50 true 40pm bbrc true 1 100 0.5 0.003
+#SAL 50 false 70pm mle true 1 100 0.5 0.003
+#SAL 50 false 70pm mean true 1 100 0.5 0.003
+#SAL 50 false 70pm bbrc true 1 100 0.5 0.003
+#KAZ 50 true 30pm mle true 1 100 0.5 0.003
+#KAZ 50 true 30pm mean true 1 100 0.5 0.003
+#KAZ 50 true 30pm bbrc true 1 100 0.5 0.003
+#KAZ 50 false 90pm mle true 1 100 0.5 0.003
+#KAZ 50 false 90pm mean true 1 100 0.5 0.003
+#KAZ 50 false 90pm bbrc true 1 100 0.5 0.003
--
cgit v1.2.3
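
The per-round deviation statistics written to the CSV change with this commit: E1 now accumulates the signed p-value differences and E2 their absolute values, and both are divided by the number of BBRC SMARTS, so they behave as mean deviations rather than the plain sum and squared sum that the unchanged log labels still mention. Below is a minimal standalone Ruby sketch of that computation; the two p-value hashes are hypothetical stand-ins for the values the script extracts from the BBRC feature dataset and the matched test dataset.

# Standalone sketch of the E1/E2 statistics as computed after this commit.
# The SMARTS keys and p-values below are made-up example data.
bbrc_smarts_pValues    = { "c1ccccc1" => 0.02, "C=O" => 0.40, "N(=O)=O" => 0.01 }
matched_smarts_pValues = { "c1ccccc1" => 0.05, "C=O" => 0.35 }

sum_E1 = 0.0
sum_E2 = 0.0
bbrc_smarts_pValues.each do |smarts, p|
  # only SMARTS present in both feature sets contribute to the sums
  next unless matched_smarts_pValues.include?(smarts)
  dif = p.to_f - matched_smarts_pValues[smarts].to_f  # signed difference
  sum_E1 = sum_E1 + dif
  sum_E2 = sum_E2 + dif.abs
end
# Both sums are normalised by the total number of BBRC SMARTS, as in the patch.
sum_E1 = sum_E1 / bbrc_smarts_pValues.size
sum_E2 = sum_E2 / bbrc_smarts_pValues.size
puts "E1 (mean signed p-value difference):   #{sum_E1}"
puts "E2 (mean absolute p-value difference): #{sum_E2}"

With the new eleven-argument interface, a run corresponding to the first (commented-out) row of factors_config_dv would be invoked as: ruby bbrc_sample_dv.rb path/to/dataset.yaml MOU 50 true 20pm mle true 1 100 0.5 0.003, where the dataset.yaml path is a placeholder for an actual dataset description file.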