summaryrefslogtreecommitdiff
path: root/bbrc-sample/bbrc_sample_dv.rb
blob: 6a7c167a2fec194e0c7dbfc50430307869980adc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# Author: Andreas Maunz, David Vorgrimmler

require 'rubygems'
require 'opentox-ruby'
require 'yaml'

# Exactly six positional arguments are expected; otherwise print the usage
# line followed by the number of arguments actually received, then quit.
unless ARGV.size == 6
  puts "Args: path/to/dataset.yaml ds_name num_boots backbone min_frequency method"
  puts ARGV.size
  exit
end

# First argument: path to the YAML file describing the datasets.
path = ARGV[0]
# Only the file name (no directories) is used in the status messages below.
ds_file = File.basename(path)

# Stop early when the YAML file is missing.
# File.exist? is used because File.exists? is deprecated and was removed in
# Ruby 3.2, where calling it raises NoMethodError.
if File.exist?(path)
  puts "[#{Time.now.iso8601(4).to_s}] #{ds_file} exists."
else
  puts "#{ds_file} does not exist."
  exit
end

# Not referenced anywhere else in this script.
subjectid = nil

# Remaining positional arguments (ARGV[0], the YAML path, is read above).
ds_name   = ARGV[1] # key of the dataset entry inside the YAML file, e.g. MOU
num_boots = ARGV[2] # sent as "num_boots" to fminer/bbrc/sample (presumably the number of bootstrap samples)
backbone  = ARGV[3] # sent as "backbone" to the mining service; true/false
min_freq  = ARGV[4] # sent as "min_frequency" to mining and matching; e.g. [100, 90, ..., 10]
method    = ARGV[5] # "bbrc" selects plain fminer/bbrc, anything else fminer/bbrc/sample; e.g. MLE, MEAN, BBRC
hits      = false   # sent as "nr_hits" to mining and matching

# Look up the dataset URI stored under ds_name => "dataset" in the YAML file.
ds = YAML.load_file(path)
ds_uri = ds[ds_name]["dataset"]

# Per-round accumulators filled by the main loop:
# result1 collects sum_E1, result2 collects sum_E2, metadata the parameters used.
result1, result2, metadata = [], [], []

# Prints a message prefixed with the current timestamp in square brackets.
log = lambda { |message| puts "[#{Time.now.iso8601(4)}] #{message}" }

# Builds a hash mapping SMARTS => pValue for every feature of +dataset+ whose
# RDF type list includes OT.Substructure.
substructure_p_values = lambda do |dataset|
  p_values = {}
  dataset.features.each do |_feature, values|
    next unless values[RDF::type].include?(OT.Substructure)
    p_values[values[OT::smarts]] = values[OT::pValue]
  end
  p_values
end

# Runs 50 rounds. Each round:
#   1. splits the dataset 50/50 (stratified) using the round number as seed,
#   2. mines BBRC features on the training half,
#   3. matches those features against the test half,
#   4. compares the pValues of features present in both result datasets.
# The per-round error sums are appended to result1 / result2 and the
# parameters used are appended to metadata.
(1..50).each do |i|
  puts
  puts "--------------------------- Round: #{i} ---------------------------"

  # SPLIT
  puts "                       ----- split ds -----"
  split_params = {}
  split_params["dataset_uri"] = ds_uri
  split_params["prediction_feature"] = (ds_uri.to_s + "/feature/1")
  split_params["stratified"] = true
  split_params["split_ratio"] = 0.5
  split_params["random_seed"] = i
  log.call("Split params: #{split_params.to_yaml}")

  split_result = OpenTox::RestClientWrapper.post( File.join(CONFIG[:services]["opentox-validation"],"plain_training_test_split"), split_params)
  # The response is inspected, stripped of double quotes and split on the
  # escaped newline; the first entry is used as training set, the second as
  # test set.
  split_uris = split_result.inspect.gsub(/"/,'').split("\\n")
  datasets = {}
  datasets[:training_ds] = split_uris[0]
  datasets[:test_ds] = split_uris[1]
  log.call("Split result: #{datasets.to_yaml}")
  puts

  # BBRC sample
  puts "                ----- bbrc feature calulation -----"
  algo_params = {}
  algo_params["dataset_uri"] = datasets[:training_ds]
  algo_params["backbone"] = backbone
  algo_params["min_frequency"] = min_freq
  algo_params["nr_hits"] = hits
  algo_params["method"] = method

  t = Time.now
  # NOTE(review): the comparison is case-sensitive, so an argument of "BBRC"
  # takes the sampling branch -- confirm this is intended.
  if method == "bbrc"
    log.call("BBRC params: #{algo_params.to_yaml}")
    feature_dataset_uri = OpenTox::RestClientWrapper.post( File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc"), algo_params )
  else
    # The sampling endpoint additionally receives the number of boots and a seed.
    algo_params["num_boots"] = num_boots
    algo_params["random_seed"] = i
    log.call("BBRC params: #{algo_params.to_yaml}")
    feature_dataset_uri = OpenTox::RestClientWrapper.post( File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc/sample"), algo_params )
  end
  duration = Time.now - t
  log.call("BBRC duration: #{duration}")
  log.call("BBRC result: #{feature_dataset_uri}")
  puts

  # Match
  puts "                      ----- bbrc match -----"
  match_params = {}
  match_params["feature_dataset_uri"] = feature_dataset_uri.to_s
  match_params["dataset_uri"] = datasets[:test_ds]
  match_params["min_frequency"] = min_freq
  match_params["nr_hits"] = hits
  log.call("Match params: #{match_params.to_yaml}")

  matched_dataset_uri = OpenTox::RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"],"fminer","bbrc","match"),match_params)
  log.call("BBRC match result: #{matched_dataset_uri}")
  puts

  # Compare pValues
  puts "                 ----- pValue comparision -----"
  bbrc_ds = OpenTox::Dataset.find(feature_dataset_uri)
  bbrc_smarts_pValues = substructure_p_values.call(bbrc_ds)

  match_ds = OpenTox::Dataset.find(matched_dataset_uri)
  matched_smarts_pValues = substructure_p_values.call(match_ds)

  # E1: sum of absolute pValue differences, E2: sum of squared differences,
  # both taken over the SMARTS present in the mined and the matched dataset.
  sum_E1 = 0.0
  sum_E2 = 0.0
  bbrc_smarts_pValues.each do |smarts, p_value|
    next unless matched_smarts_pValues.include?(smarts)
    dif = (p_value.to_f - matched_smarts_pValues[smarts].to_f).abs
    sum_E1 += dif
    # Accumulate into sum_E2 itself; adding to sum_E1 here would discard the
    # squared terms of all previous features.
    sum_E2 += dif**2
  end
  log.call("Sum pValue difference (E1): #{sum_E1}")
  log.call("Squared sum pValue difference (E2): #{sum_E2}")

  # Save data
  result1 << sum_E1
  result2 << sum_E2

  info = []
  info << { :ds_name => ds_name, :nr_features => bbrc_ds.features.size}
  info << split_params
  info << algo_params
  info << match_params

  metadata << info

  # Disabled reporting code, kept for reference. It refers to result_uri,
  # which is not defined anywhere in this script.
  #  ds = OpenTox::Dataset.find(datasets[:training_ds])
  #  ds_nr_de = ds.data_entries.size
  #  ds_nr_com = ds.compounds.size
  #
  #  ds_result = OpenTox::Dataset.find(result_uri)
  #  ds_result_nr_de = ds_result.data_entries.size
  #  ds_result_nr_com = ds_result.compounds.size
  #  ds_result_nr_f = ds_result.features.size
  #
  #  min_sampling_support = ds_result.metadata[OT::parameters][2][OT::paramValue]
  #  num_boots = ds_result.metadata[OT::parameters][3][OT::paramValue] 
  #  min_frequency_per_sample = ds_result.metadata[OT::parameters][4][OT::paramValue]
  #  nr_hits = ds_result.metadata[OT::parameters][5][OT::paramValue]
  #  merge_time = ds_result.metadata[OT::parameters][6][OT::paramValue]
  #  n_stripped_mss = ds_result.metadata[OT::parameters][7][OT::paramValue]
  #  n_stripped_cst = ds_result.metadata[OT::parameters][8][OT::paramValue]
  #  random_seed = ds_result.metadata[OT::parameters][9][OT::paramValue]
  #
  #  puts "[#{Time.now.iso8601(4).to_s}] nr dataentries: #{ds_result_nr_de} , (of #{ds_nr_de} )"
  #  puts "[#{Time.now.iso8601(4).to_s}] nr dataentries: #{ds_result_nr_com} , (of #{ds_nr_com} )"
  #  puts "[#{Time.now.iso8601(4).to_s}] nr features: #{ds_result_nr_f}"
  #  puts "[#{Time.now.iso8601(4).to_s}] Merge time: #{merge_time}"
  #
  #  puts "=hyperlink(\"#{ds_uri}\";\"#{ds_name}\"),#{num_boots},#{min_sampling_support},#{min_frequency_per_sample},#{nr_hits},=hyperlink(\"#{result_uri}\";\"bbrc_result\"),#{ds_result_nr_com},#{ds_nr_com},#{ds_result_nr_f},#{duration},#{merge_time},#{n_stripped_mss},#{n_stripped_cst},#{random_seed}"

  puts

end

# Final report: a banner, then each collected structure dumped as YAML,
# every dump preceded by a blank line and prefixed with a timestamp.
puts "############################################",
     "############# FINAL RESULTS ################",
     "############################################"

# An array of pairs keeps the output order fixed: metadata, result1, result2.
[["metadata", metadata], ["result1", result1], ["result2", result2]].each do |label, collected|
  puts
  puts "[#{Time.now.iso8601(4).to_s}] #{label}: #{collected.to_yaml}"
end