author    davor <vorgrimmlerdavid@gmx.de>  2012-07-10 16:28:12 +0200
committer davor <vorgrimmlerdavid@gmx.de>  2012-07-10 16:28:12 +0200
commit    45e48d03d0f294f10e4b5eec3fb519dcb2851624 (patch)
tree      d1ac4ad73b93012557b699627d1c7fa02e642719
parent    54a5b6dc2849a17af83a5fe081c5749ad9203081 (diff)
Updated code. (development)
Note: Code moved to https://github.com/davor/bbrc-sample-client
-rw-r--r--  bbrc-sample/bbrc_sample_dv.rb   142
-rw-r--r--  bbrc-sample/factors_config_dv    62
2 files changed, 157 insertions, 47 deletions
diff --git a/bbrc-sample/bbrc_sample_dv.rb b/bbrc-sample/bbrc_sample_dv.rb
index e3c268d..6bb1ddd 100644
--- a/bbrc-sample/bbrc_sample_dv.rb
+++ b/bbrc-sample/bbrc_sample_dv.rb
@@ -3,22 +3,83 @@
require 'rubygems'
require 'opentox-ruby'
require 'yaml'
+require 'csv'
-if ARGV.size != 9
- puts "Args: path/to/dataset.yaml ds_name num_boots backbone min_frequency method find_min_frequency start_seed end_seed"
+def check_params(args, dataset_names)
+ if ! (dataset_names.include? args[1])
+ puts "dataset name has to exist in dataset.yaml"
+ exit 1
+ end
+
+ if args[2].to_i <= 2
+    puts "num_boots must be a natural number greater than 2 (100 recommended)"
+ exit 1
+ end
+
+ if args[3].to_s != "true" && args[3].to_s != "false"
+ puts "backbone must be 'true' or 'false'."
+ exit 1
+ end
+
+ if args[4].gsub(/[pmc]/, '').to_i <= 0
+    puts "min_frequency must be a natural number X (optionally with a 'pm' or 'pc' suffix, e.g. 20pm)"
+ exit 1
+ end
+
+ if ! (['bbrc', 'mean', 'mle'].include? args[5])
+ puts "method must be 'bbrc', 'mean' or 'mle'"
+ exit 1
+ end
+
+ if args[6].to_s != "true" && args[6].to_s != "false"
+ puts "find_min_frequency must be 'true' or 'false'"
+ exit 1
+ end
+
+ if args[7].to_i < 1
+ puts "start_seed must be a natural number"
+ exit 1
+ end
+
+ if args[8].to_i < 1
+ puts "end_seed must be a natural number"
+ exit 1
+ end
+
+ if args[7].to_i > args[8].to_i
+    puts "start_seed must be smaller than or equal to end_seed"
+ exit 1
+ end
+
+ if ! (args[9].to_f <= 0.9 && args[9].to_f >= 0.1)
+ puts "split_ratio must be between 0.1 and 0.9"
+ exit 1
+ end
+
+ if ! (args[10].to_f <= 0.1 && args[10].to_f >= 0.0005)
+ puts "time_per_cmpd must be between 0.0005 and 0.1"
+ exit 1
+ end
+end
+
+if ARGV.size != 11
+ puts "Args: path/to/dataset.yaml ds_name num_boots backbone min_frequency method find_min_frequency start_seed end_seed split_ratio time_per_cmpd"
puts ARGV.size
exit
end
path = ARGV[0]
ds_file = path.split("/").last
-
if File.exists?(path)
puts "[#{Time.now.iso8601(4).to_s}] #{ds_file} exists."
else
puts "#{ds_file} does not exist."
exit
end
+ds = YAML::load_file("#{path}")
+ds_names = ds.keys
+
+check_params(ARGV, ds_names)
subjectid = nil
@@ -26,19 +87,17 @@ ds_name = ARGV[1] # e.g. MOU,RAT
num_boots = ARGV[2] # integer, 100 recommended
backbone = ARGV[3] # true/false
min_freq = ARGV[4] # integer
-method = ARGV[5] # mle, mean, bbrc
+method = ARGV[5] # mle, mean or bbrc
find_min_frequency = ARGV[6] # true/false
-start_seed = ARGV[7] # integer (< end_seed)
-end_seed = ARGV[8] #integer (> start_seed)
+start_seed = ARGV[7].to_i # integer (<= end_seed)
+end_seed = ARGV[8].to_i #integer (>= start_seed)
+split_ratio = ARGV[9].to_f # float, default 0.5 (>=0.1 and <=0.9)
+time_per_cmpd = ARGV[10].to_f # float, 0.003 (seconds) recommended; this is only an empirical value.
hits = false
-if start_seed > end_seed
- puts "Start_seed has to be smaller than end_seed. "
-end
-
-ds = YAML::load_file("#{path}")
ds_uri = ds[ds_name]["dataset"]
+finished_rounds = 0
result1 = []
result2 = []
metadata = []
@@ -54,10 +113,14 @@ statistics[:merge_time] = []
statistics[:n_stripped_mss] = []
statistics[:n_stripped_cst] = []
+$stdout.flush
+
begin
for i in start_seed..end_seed
puts
puts "--------------------------- Round: #{i} ---------------------------"
+ $stdout.flush
+ del_ds = []
#################################
# SPLIT
@@ -67,7 +130,7 @@ begin
split_params["dataset_uri"] = ds_uri
split_params["prediction_feature"] = (ds_uri.to_s + "/feature/1")
split_params["stratified"] = true
- split_params["split_ratio"] = 0.5
+ split_params["split_ratio"] = split_ratio
split_params["random_seed"] = i
puts "[#{Time.now.iso8601(4).to_s}] Split params: #{split_params.to_yaml}"
@@ -75,8 +138,10 @@ begin
datasets = {}
datasets[:training_ds] = split_result.inspect.gsub(/"/,'').split("\\n")[0]
datasets[:test_ds] = split_result.inspect.gsub(/"/,'').split("\\n")[1]
+ del_ds = del_ds + datasets.values
puts "[#{Time.now.iso8601(4).to_s}] Split result: #{datasets.to_yaml}"
puts
+ $stdout.flush
#################################
# FIND "good" min_frequency
@@ -106,12 +171,13 @@ begin
result_uri = OpenTox::RestClientWrapper.post( File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc/"), min_params )
durations << Time.now - t
ds_result = OpenTox::Dataset.find(result_uri)
+ del_ds << ds_result.uri
ds_result_nr_f = ds_result.features.size
end
# puts
# puts "----- Main phase: -----"
- max_duration = durations[0] +(ds_nr_com.to_f * 0.003) # this is only an experience value.
+ max_duration = durations[0] +(ds_nr_com.to_f * time_per_cmpd)
min_params["min_frequency"] = y
y = y_old
found = false
@@ -127,6 +193,7 @@ begin
result_uri = OpenTox::RestClientWrapper.post( File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc/"), min_params )
durations << Time.now - t
ds_result = OpenTox::Dataset.find(result_uri)
+ del_ds << ds_result.uri
ds_result_nr_f = ds_result.features.size
# Check if the number of features is at most half and at least one-tenth of the number of compounds and was computed in an acceptable amount of time
if ds_result_nr_f.to_i < (ds_nr_com/2).to_i && ds_result_nr_f.to_i > (ds_nr_com/10).to_i
@@ -169,6 +236,7 @@ begin
puts "[#{Time.now.iso8601(4).to_s}] BBRC duration: #{bbrc_duration}"
puts "[#{Time.now.iso8601(4).to_s}] BBRC result: #{feature_dataset_uri}"
puts
+ $stdout.flush
#################################
# MATCH
@@ -184,12 +252,15 @@ begin
matched_dataset_uri = OpenTox::RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"],"fminer","bbrc","match"),match_params)
puts "[#{Time.now.iso8601(4).to_s}] BBRC match result: #{matched_dataset_uri}"
puts
+ $stdout.flush
#################################
# COMPARE pValues
#################################
puts " ----- pValue comparison -----"
bbrc_ds = OpenTox::Dataset.find(feature_dataset_uri)
+ bbrc_ds.save(subjectid)
+ del_ds << bbrc_ds.uri
bbrc_smarts_pValues = {}
bbrc_ds.features.each do |f, values|
if values[RDF::type].include?(OT.Substructure)
@@ -209,13 +280,16 @@ begin
sum_E2 = 0.0
bbrc_smarts_pValues.each do |s, p|
if matched_smarts_pValues.include?(s)
- dif = (p.to_f - matched_smarts_pValues[s].to_f).abs
+ dif = (p.to_f - matched_smarts_pValues[s].to_f)
sum_E1 = sum_E1 + dif
- sum_E2 = sum_E2 + dif**2
+ sum_E2 = sum_E2 + dif.abs
end
- end
+ end
+ sum_E1 = sum_E1/bbrc_smarts_pValues.size
+ sum_E2 = sum_E2/bbrc_smarts_pValues.size
puts "[#{Time.now.iso8601(4).to_s}] Sum pValue difference (E1): #{sum_E1}"
puts "[#{Time.now.iso8601(4).to_s}] Squared sum pValue difference (E2): #{sum_E2}"
+ $stdout.flush
#################################
# SAVE data
@@ -248,8 +322,28 @@ begin
metadata << info
puts
+ finished_rounds += 1
+    # Clean up temporary datasets collected this round (assumes OpenTox::Dataset#delete is available)
+    del_ds.each do |del_ds_uri|
+      ds = OpenTox::Dataset.find(del_ds_uri, subjectid)
+      ds.delete(subjectid) unless ds.nil?
+    end
+ $stdout.flush
+ end
+
+ #################################
+ # Create CSV result
+ #################################
+ csv_file_name = "bbrc_sample_#{ds_name}_#{method}_#{start_seed}_#{(start_seed + finished_rounds)-1}_results.csv"
+ if File.exists?(csv_file_name)
+ csv_file_name = csv_file_name + Time.now.usec.to_s
end
+ CSV.open(csv_file_name, 'w') do |writer|
+ writer << ['E1', 'E2']
+ for i in 0..(result1.size-1)
+ writer << [result1[i], result2[i]]
+ end
+ end
+
min_sampling_support = (statistics[:min_sampling_support].inject{|sum,x| sum + x })/(statistics[:min_sampling_support].size) unless statistics[:min_sampling_support].compact.empty?
min_frequency_per_sample = (statistics[:min_frequency_per_sample].inject{|sum,x| sum + x })/(statistics[:min_frequency_per_sample].size) unless statistics[:min_frequency_per_sample].compact.empty?
bbrc_ds_nr_com = (statistics[:bbrc_ds_nr_com].inject{|sum,x| sum + x })/(statistics[:bbrc_ds_nr_com].size) unless statistics[:bbrc_ds_nr_com].compact.empty?
@@ -285,6 +379,21 @@ rescue Exception => e
LOGGER.debug "#{e.class}: #{e.message}"
LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ #################################
+ # Create CSV result
+ #################################
+ csv_file_name = "bbrc_sample_#{ds_name}_#{method}_#{start_seed}_#{start_seed + finished_rounds}_results.csv"
+ if File.exists?(csv_file_name)
+ csv_file_name = csv_file_name + Time.now.usec.to_s
+ end
+
+ CSV.open(csv_file_name, 'w') do |writer|
+ writer << ['E1', 'E2']
+    for i in 0..(result1.size-1)
+ writer << [result1[i], result2[i]]
+ end
+ end
+
min_sampling_support = (statistics[:min_sampling_support].inject{|sum,x| sum + x })/(statistics[:min_sampling_support].size) unless statistics[:min_sampling_support].compact.empty?
min_frequency_per_sample = (statistics[:min_frequency_per_sample].inject{|sum,x| sum + x })/(statistics[:min_frequency_per_sample].size) unless statistics[:min_frequency_per_sample].compact.empty?
bbrc_ds_nr_com = (statistics[:bbrc_ds_nr_com].inject{|sum,x| sum + x })/(statistics[:bbrc_ds_nr_com].size) unless statistics[:bbrc_ds_nr_com].compact.empty?
@@ -316,3 +425,4 @@ rescue Exception => e
puts "[#{Time.now.iso8601(4).to_s}] result2: #{result2.to_yaml}"
end
+
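For reference, the E1/E2 metrics as changed above are now the sums of the signed and the absolute pValue differences over the SMARTS present in both feature sets, each divided by the total number of BBRC SMARTS (despite the variable names, sum_E1 and sum_E2 end up as means, not sums). A minimal standalone sketch with hypothetical pValue hashes:

bbrc_smarts_pValues    = { "cC" => 0.8, "cN" => 0.2, "C=O" => 0.5 }  # hypothetical values
matched_smarts_pValues = { "cC" => 0.6, "cN" => 0.3 }                # hypothetical values

sum_E1 = 0.0
sum_E2 = 0.0
bbrc_smarts_pValues.each do |s, p|
  if matched_smarts_pValues.include?(s)
    dif = p.to_f - matched_smarts_pValues[s].to_f
    sum_E1 += dif      # signed difference
    sum_E2 += dif.abs  # absolute difference
  end
end
sum_E1 = sum_E1 / bbrc_smarts_pValues.size
sum_E2 = sum_E2 / bbrc_smarts_pValues.size
puts "E1 (mean signed pValue difference):   #{sum_E1}"
puts "E2 (mean absolute pValue difference): #{sum_E2}"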
diff --git a/bbrc-sample/factors_config_dv b/bbrc-sample/factors_config_dv
index 127c34b..d3e586d 100644
--- a/bbrc-sample/factors_config_dv
+++ b/bbrc-sample/factors_config_dv
@@ -1,31 +1,31 @@
-#Dataset num_boots backbone min_frequence method find_min_frequency start_seed end_seed
-#MOU 100 true 20pm mle true 1 50
-#MOU 100 true 20pm mean true 1 50
-#MOU 100 true 20pm bbrc true 1 50
-#MOU 100 false 30pm mle true 1 50
-#MOU 100 false 30pm mean true 1 50
-#MOU 100 false 30pm bbrc true 1 50
-#RAT 100 true 20pm mle true 1 50
-#RAT 100 true 20pm mean true 1 50
-#RAT 100 true 20pm bbrc true 1 50
-#RAT 100 false 40pm mle true 1 50
-#RAT 100 false 40pm mean true 1 50
-#RAT 100 false 40pm bbrc true 1 50
-#MCC 100 true 20pm mle true 1 50
-#MCC 100 true 20pm mean true 1 50
-#MCC 100 true 20pm bbrc true 1 50
-#MCC 100 false 50pm mle true 1 50
-#MCC 100 false 50pm mean true 1 50
-#MCC 100 false 50pm bbrc true 1 50
-#SAL 100 true 40pm mle true 1 50
-#SAL 100 true 40pm mean true 1 50
-#SAL 100 true 40pm bbrc true 1 50
-#SAL 100 false 70pm mle true 1 50
-#SAL 100 false 70pm mean true 1 50
-#SAL 100 false 70pm bbrc true 1 50
-#KAZ 100 true 30pm mle true 1 50
-#KAZ 100 true 30pm mean true 1 50
-#KAZ 100 true 30pm bbrc true 1 50
-#KAZ 100 false 90pm mle true 1 50
-#KAZ 100 false 90pm mean true 1 50
-#KAZ 100 false 90pm bbrc true 1 50
+#Dataset num_boots backbone min_frequency method find_min_frequency start_seed end_seed split_ratio time_per_cmpd
+#MOU 50 true 20pm mle true 1 100 0.5 0.003
+#MOU 50 true 20pm mean true 1 100 0.5 0.003
+#MOU 50 true 20pm bbrc true 1 100 0.5 0.003
+#MOU 50 false 30pm mle true 1 100 0.5 0.003
+#MOU 50 false 30pm mean true 1 100 0.5 0.003
+#MOU 50 false 30pm bbrc true 1 100 0.5 0.003
+#RAT 50 true 20pm mle true 1 100 0.5 0.003
+#RAT 50 true 20pm mean true 1 100 0.5 0.003
+#RAT 50 true 20pm bbrc true 1 100 0.5 0.003
+#RAT 50 false 40pm mle true 1 100 0.5 0.003
+#RAT 50 false 40pm mean true 1 100 0.5 0.003
+#RAT 50 false 40pm bbrc true 1 100 0.5 0.003
+#MCC 50 true 20pm mle true 1 100 0.5 0.003
+#MCC 50 true 20pm mean true 1 100 0.5 0.003
+#MCC 50 true 20pm bbrc true 1 100 0.5 0.003
+#MCC 50 false 100pm mle true 1 100 0.5 0.003
+#MCC 50 false 100pm mean true 1 100 0.5 0.003
+#MCC 50 false 100pm bbrc true 1 100 0.5 0.003
+#SAL 50 true 40pm mle true 1 100 0.5 0.003
+#SAL 50 true 40pm mean true 1 100 0.5 0.003
+#SAL 50 true 40pm bbrc true 1 100 0.5 0.003
+#SAL 50 false 70pm mle true 1 100 0.5 0.003
+#SAL 50 false 70pm mean true 1 100 0.5 0.003
+#SAL 50 false 70pm bbrc true 1 100 0.5 0.003
+#KAZ 50 true 30pm mle true 1 100 0.5 0.003
+#KAZ 50 true 30pm mean true 1 100 0.5 0.003
+#KAZ 50 true 30pm bbrc true 1 100 0.5 0.003
+#KAZ 50 false 90pm mle true 1 100 0.5 0.003
+#KAZ 50 false 90pm mean true 1 100 0.5 0.003
+#KAZ 50 false 90pm bbrc true 1 100 0.5 0.003
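
The rows above map one-to-one onto the script's command-line arguments: path/to/dataset.yaml followed by the ten columns, in the same order as the usage string in bbrc_sample_dv.rb. A minimal sketch, assuming one uncommented row of factors_config_dv and a hypothetical dataset.yaml path, that builds the corresponding call:

# Hypothetical helper: turn one row of factors_config_dv into the argument
# list expected by bbrc_sample_dv.rb. The dataset.yaml path is an assumption.
row  = "MOU 50 true 20pm mle true 1 100 0.5 0.003"
args = ["path/to/dataset.yaml"] + row.split
puts "ruby bbrc_sample_dv.rb #{args.join(' ')}"
# => ruby bbrc_sample_dv.rb path/to/dataset.yaml MOU 50 true 20pm mle true 1 100 0.5 0.003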