From 63320057e2a2b2121c5c405c31e2e7b709fa9e44 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 31 Jan 2012 12:37:36 +0100 Subject: adapt validation to r-util (split), add new r plots --- lib/merge.rb | 10 ++-- lib/r-util.rb | 82 -------------------------- lib/stratification.R | 123 --------------------------------------- report/environment.rb | 13 +++++ report/plot_factory.rb | 55 +++++++++++++++++ report/report_content.rb | 85 ++++++++++++++++++++++++++- report/report_factory.rb | 74 ++++++++++++++++++----- report/report_service.rb | 4 +- report/statistical_test.rb | 49 +--------------- report/validation_access.rb | 60 ++++++++++++++++--- report/validation_data.rb | 23 +++++++- validation/validation_service.rb | 74 +++++++++++------------ 12 files changed, 325 insertions(+), 327 deletions(-) delete mode 100644 lib/r-util.rb delete mode 100644 lib/stratification.R diff --git a/lib/merge.rb b/lib/merge.rb index f30a3c1..bc6e1a7 100644 --- a/lib/merge.rb +++ b/lib/merge.rb @@ -31,6 +31,11 @@ module Lib return merge_count(object)>1 end + def self.merge_count( object ) + @@merge_count[object] = 1 if @@merge_count[object]==nil + return @@merge_count[object] + end + def self.merge_objects( object1, object2 ) raise "classes not equal : "+object1.class.to_s+" != "+object2.class.to_s if object1.class != object2.class object_class = object1.class @@ -137,11 +142,6 @@ module Lib {:value => value, :variance => variance } end - def self.merge_count( object ) - @@merge_count[object] = 1 if @@merge_count[object]==nil - return @@merge_count[object] - end - def self.set_merge_count(object, merge_count) @@merge_count[object] = merge_count end diff --git a/lib/r-util.rb b/lib/r-util.rb deleted file mode 100644 index 0d58389..0000000 --- a/lib/r-util.rb +++ /dev/null @@ -1,82 +0,0 @@ -# pending: package dir hack --------- -# CONFIG[:base_dir] = "/home//opentox-ruby/www" -# PACKAGE_DIR = "/home//opentox-ruby/r-packages" -package_dir = CONFIG[:base_dir].split("/") -package_dir[-1] = "r-packages" -package_dir = package_dir.join("/") -PACKAGE_DIR = package_dir - - - -module Lib - - module RUtil - - def self.dataset_to_dataframe( dataset ) - LOGGER.debug "convert dataset to dataframe #{dataset.uri}" - all_features = [] - dataset.features.each do |f| - feat_name = "feature_#{f[0].split("/")[-1]}" - LOGGER.debug "- adding feature: #{feat_name}" - feat = OpenTox::Feature.find(f[0]) - nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature) - values = [] - dataset.compounds.each do |c| - val = dataset.data_entries[c][f[0]] - raise "not yet implemented" if val!=nil && val.size>1 - v = val==nil ? "" : val[0].to_s - v = "NA" if v.size()==0 - values << v - end - all_features << feat_name - @@r.assign feat_name,values - @@r.eval "#{feat_name} <- as.numeric(#{feat_name})" unless nominal - end - df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}" - cmd = "#{df_name} <- data.frame(#{all_features.join(",")})" - @@r.eval cmd - #@@r.eval "head(#{df_name})" - df_name - end - - def self.stratified_split( dataframe, pct=0.3, seed=42 ) - @@r.eval "set.seed(#{seed})" - @@r.eval "split <- stratified_split(#{dataframe}, ratio=#{pct})" - split = @@r.pull 'split' - split.collect{|s| s.to_i} - end - - def self.package_installed?( package ) - @@r.eval ".libPaths(\"#{PACKAGE_DIR}\")" - p = @@r.pull "installed.packages()[,1]" - p.include?(package) - end - - def self.install_packages( package ) - unless package_installed? 
package - @@r.eval "install.packages(\"#{package}\", repos=\"http://cran.r-project.org\", dependencies=T, lib=\"#{PACKAGE_DIR}\")" - end - end - - def self.library( package ) - install_packages( package ) - @@r.eval "library(\"#{package}\")" - end - - def self.init_r - @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r - library("sampling") - library("gam") - @@r.eval "source(\"#{PACKAGE_DIR}/stratification.R\")" - end - - def self.quit_r - begin - @@r.quit - @@r = nil - rescue - end - end - - end -end diff --git a/lib/stratification.R b/lib/stratification.R deleted file mode 100644 index 9aa8d1f..0000000 --- a/lib/stratification.R +++ /dev/null @@ -1,123 +0,0 @@ -library("sampling") -library("gam") - -nominal_to_binary <- function( orig_data ) -{ - data = as.data.frame( orig_data ) - result = NULL - for (i in 1:ncol(data)) - { - #print(i) - if (is.numeric( data[,i] ) ) - { - if (is.null(result)) - result = data.frame(data[,i]) - else - result = data.frame(result, data[,i]) - colnames(result)[ncol(result)] <- colnames(data)[i] - } - else - { - vals = unique(data[,i]) - for (j in 1:length(vals)) - { - #print(j) - bins = c() - for (k in 1:nrow(data)) - { - if(data[,i][k] == vals[j]) - bins = c(bins,1) - else - bins = c(bins,0) - } - #print(bins) - if (is.null(result)) - result = data.frame(bins) - else - result = data.frame(result, bins) - colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j]) - if (length(vals)==2) break - } - } - } - result -} - -process_data <- function( data ) -{ - if (!is.numeric(data)) - data.num = nominal_to_binary(data) - else - data.num = data - if(any(is.na(data.num))) - data.repl = na.gam.replace(data.num) - else - data.repl = data.num - data.repl -} - -stratified_split <- function( data, ratio=0.3 ) -{ - data.processed = as.matrix(process_data( data )) - pik = rep(ratio,times=nrow(data.processed)) - data.strat = cbind(pik,data.processed) - samplecube(data.strat,pik,order=2,comment=F) -} - -stratified_k_fold_split <- function( data, num_folds=10 ) -{ - print(paste(num_folds,"-fold-split, data-size",nrow(data))) - data.processed = as.matrix(process_data( data )) - folds = rep(0, times=nrow(data)) - for (i in 1:(num_folds-1)) - { - prop = 1/(num_folds-(i-1)) - print(paste("fold",i,"/",num_folds," prop",prop)) - pik = rep(prop,times=nrow(data)) - for (j in 1:nrow(data)) - if(folds[j]!=0) - pik[j]=0 - data.strat = cbind(pik,data.processed) - s<-samplecube(data.strat,pik,order=2,comment=F) - print(paste("fold size: ",sum(s))) - for (j in 1:nrow(data)) - if (s[j] == 1) - folds[j]=i - } - for (j in 1:nrow(data)) - if (folds[j] == 0) - folds[j]=num_folds - folds -} - -plot_split <- function( data, split ) -{ - data.processed = process_data( data ) - data.pca <- prcomp(data.processed, scale=TRUE) - data.2d =as.data.frame(data.pca$x)[1:2] - plot( NULL, - xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]), - xlab = "pc 1", ylab = "pc 2") - for (j in 0:max(split)) - { - set = c() - for (i in 1:nrow(data)) - if (split[i] == j) - set = c(set,i) - points(data.2d[set,], pch = 2, col=(j+1)) - } -} - -#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5) -#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5) -#data<-rbind(a,b) -#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5) -#data<-rbind(data,c) -#data=iris -#split = stratified_k_fold_split(data, num_folds=3) -#split = stratified_split(data, ratio=0.3) -#plot_split(data,split) - - - - diff --git a/report/environment.rb b/report/environment.rb index 34554f7..7addc45 100755 --- a/report/environment.rb +++ 
b/report/environment.rb @@ -6,6 +6,19 @@ end gem 'ruby-plot', "~>0.6.0" module Reports + + def self.r_util + @@r_util = OpenTox::RUtil.new unless defined?@@r_util and @@r_util + @@r_util + end + + def self.quit_r + if defined?@@r_util and @@r_util + @@r_util.quit_r + @@r_util = nil + end + end + end require "lib/ot_predictions.rb" diff --git a/report/plot_factory.rb b/report/plot_factory.rb index 2d7946f..f114dd3 100644 --- a/report/plot_factory.rb +++ b/report/plot_factory.rb @@ -100,6 +100,34 @@ module Reports omit_count end + def self.create_train_test_plot( out_files, validation_set, only_prediction_feature, waiting_task ) + if only_prediction_feature + train = [] + test = [] + validation_set.validations.each do |v| + [[v.test_dataset_uri, test, v.test_target_dataset_uri], + [v.training_dataset_uri, train, v.training_dataset_uri]].each do |uri,array,uri2| + d = Lib::DatasetCache.find(uri, validation_set.validations[0].subjectid) + d2 = Lib::DatasetCache.find((uri2 ? uri2 : uri), validation_set.validations[0].subjectid) + d.compounds.each do |c| + d2.data_entries[c][v.prediction_feature].each do |val| + array << val + end if d2.data_entries[c] and d2.data_entries[c][v.prediction_feature] + end + end + end + waiting_task.progress(50) if waiting_task + + numerical = validation_set.unique_feature_type=="regression" + Reports::r_util.double_hist_plot(out_files, train, test, numerical, numerical, "Training Data", "Test Data", + "Prediction Feature Distribution", validation_set.validations.first.prediction_feature ) + else + Reports::r_util.feature_value_plot(out_files, validation_set.validations[0].training_feature_dataset_uri, + validation_set.validations[0].test_feature_dataset_uri, "Training Data", "Test Data", + nil, true, validation_set.validations[0].subjectid, waiting_task ) + end + end + # creates a roc plot (result is plotted into out_file) # * if (split_set_attributes == nil?) 
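
The create_train_test_plot hunk above leans on the shared R session that report/environment.rb now memoizes in Reports.r_util. A minimal sketch of that call path, assuming the OpenTox::RUtil API exactly as this patch calls it (double_hist_plot, quit_r); the feature values and the feature URI below are made up for illustration:

    require "opentox-ruby"   # assumed to provide OpenTox::RUtil

    # hypothetical prediction-feature values collected from the
    # training and test datasets (see create_train_test_plot above)
    train = [1.2, 3.4, 2.2, 0.9]
    test  = [2.1, 1.7]

    numerical = true  # regression endpoint, so both histograms are numeric
    Reports::r_util.double_hist_plot(["dist.png", "dist.svg"], train, test,
      numerical, numerical, "Training Data", "Test Data",
      "Prediction Feature Distribution", "http://example.org/feature/1")
    Reports::quit_r   # report_service.rb closes the session after report creation
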
@@ -193,6 +221,33 @@ module Reports end end + def self.create_box_plot( out_files, validation_set, title_attribute, value_attribute, class_value ) + + out_files = [out_files] unless out_files.is_a?(Array) + LOGGER.debug "creating box plot, out-files:"+out_files.inspect + + data = {} + validation_set.validations.each do |v| + value = v.send(value_attribute) + if value.is_a?(Hash) + if class_value==nil + avg_value = 0 + value.values.each{ |val| avg_value+=val } + value = avg_value/value.values.size.to_f + else + raise "box plot value is hash, but no entry for class-value ("+class_value.to_s+ + "); value for "+value_attribute.to_s+" -> "+value.inspect unless value.key?(class_value) + value = value[class_value] + end + end + + data[v.send(title_attribute).to_s] = [] unless data[v.send(title_attribute).to_s] + data[v.send(title_attribute).to_s] << value + end + + Reports::r_util.boxplot( out_files, data) + end + def self.create_bar_plot( out_files, validation_set, title_attribute, value_attributes ) out_files = [out_files] unless out_files.is_a?(Array) diff --git a/report/report_content.rb b/report/report_content.rb index 3d92b52..80473c5 100755 --- a/report/report_content.rb +++ b/report/report_content.rb @@ -61,7 +61,6 @@ class Reports::ReportContent test_matrix[:num_results].to_s, table, true, true) end end - Reports::ReportStatisticalTest.quit_r end def add_predictions( validation_set, @@ -183,6 +182,39 @@ class Reports::ReportContent align_last_two_images section_title+" in logarithmic and linear scale (values <= 0 are omitted in logarithmic scale)" end + def add_train_test_plot( validation_set, + only_prediction_feature, + waiting_task, + section_title="Training Test Distribution Plot", + section_text=nil, + image_title=nil) + + section_plot = @current_section + prediction_set = validation_set.collect{ |v| v.get_predictions } + @xml_report.add_paragraph(section_plot, section_text) if section_text + + begin + plot_png = add_tmp_file("train_test_plot_#{only_prediction_feature}", "png") + plot_svg = add_tmp_file("train_test_plot_#{only_prediction_feature}", "svg") + omit_count = Reports::PlotFactory.create_train_test_plot( [plot_png[:path], plot_svg[:path]], + prediction_set, only_prediction_feature, waiting_task ) + unless image_title + if only_prediction_feature + image_title = "Prediction Feature: #{validation_set.validations.first.prediction_feature}" + else + image_title = "Features Excluding Prediction Feature" + end + end + @xml_report.add_imagefigure(section_plot, image_title, plot_png[:name], "PNG", 100, plot_svg[:name]) + rescue Exception => ex + LOGGER.error("Could not create train test plot: "+ex.message) + rm_tmp_file(plot_png[:name]) if plot_png + rm_tmp_file(plot_svg[:name]) if plot_svg + @xml_report.add_paragraph(section_plot, "could not create train test plot: "+ex.message) + end + + end + def add_roc_plot( validation_set, accept_value, split_set_attribute=nil, @@ -317,6 +349,57 @@ class Reports::ReportContent @xml_report.add_imagefigure(section_bar, image_title, plot_png[:name], "PNG", 100, plot_svg[:name]) end + def add_box_plot(validation_set, + title_attribute, + value_attributes, + section_title="Boxplots", + section_text=nil) + + section_box = @xml_report.add_section(@current_section, section_title) + @xml_report.add_paragraph(section_box, section_text) if section_text + + plot_png = nil; plot_svg = nil + begin + plot_input = [] + value_attributes.each do |a| + accept = validation_set.get_accept_values_for_attr(a) + if accept and accept.size>0 + accept.each do |c| + 
title = a.to_s.gsub("_","-") + ( (accept.size==1 || c==nil) ? "" : "("+c.to_s+")" ) + plot_input << [a,c,title] + end + else + plot_input << [a,nil,a.to_s.gsub("_","-")] + end + end + + i = 0 + figs = [] + plot_input.each do |attrib,class_value,image_title| + plot_png = add_tmp_file("box_plot#{i}", "png") + plot_svg = add_tmp_file("box_plot#{i}", "svg") + Reports::PlotFactory.create_box_plot([plot_png[:path], plot_svg[:path]], + validation_set, title_attribute, attrib, class_value ) + figs << @xml_report.imagefigure(image_title, plot_png[:name], + "PNG", 50, plot_svg[:name]) + plot_png = nil; plot_svg = nil + i += 1 + end + + i = 1 + figs.each_slice(4) do |f| + @xml_report.add_imagefigures_in_row(section_box,f,"Boxplots #{i}") + i+=1 + end + rescue Exception => ex + msg = "WARNING could not create box plot: "+ex.message + LOGGER.error(msg) + rm_tmp_file(plot_png[:name]) if plot_png + rm_tmp_file(plot_svg[:name]) if plot_svg + @xml_report.add_paragraph(section_box, msg) + end + end + private def add_tmp_file(name, extension) tmp_file_name = name.to_s+@tmp_file_count.to_s+"."+extension.to_s diff --git a/report/report_factory.rb b/report/report_factory.rb index 2b978c5..2bb74ee 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -11,10 +11,10 @@ VAL_ATTR_REGR = [ :num_instances, :num_unpredicted, :root_mean_squared_error, :weighted_root_mean_squared_error, :mean_absolute_error, :weighted_mean_absolute_error, :r_square, :weighted_r_square, :sample_correlation_coefficient ] -#VAL_ATTR_BAR_PLOT_CLASS = [ :accuracy, :average_area_under_roc, +#VAL_ATTR_BOX_PLOT_CLASS = [ :accuracy, :average_area_under_roc, # :area_under_roc, :f_measure, :true_positive_rate, :true_negative_rate ] -VAL_ATTR_BAR_PLOT_CLASS = [ :accuracy, :f_measure, :true_positive_rate, :true_negative_rate, :positive_predictive_value, :negative_predictive_value ] -VAL_ATTR_BAR_PLOT_REGR = [ :root_mean_squared_error, :mean_absolute_error, :r_square ] +VAL_ATTR_BOX_PLOT_CLASS = [ :accuracy, :area_under_roc, :f_measure, :true_positive_rate, :true_negative_rate, :positive_predictive_value, :negative_predictive_value ] +VAL_ATTR_BOX_PLOT_REGR = [ :root_mean_squared_error, :mean_absolute_error, :r_square ] VAL_ATTR_TTEST_REGR = [ :r_square, :root_mean_squared_error ] VAL_ATTR_TTEST_CLASS = [ :accuracy, :average_area_under_roc ] @@ -29,8 +29,9 @@ module Reports::ReportFactory RT_VALIDATION = "validation" RT_CV = "crossvalidation" RT_ALG_COMP = "algorithm_comparison" + RT_METHOD_COMP = "method_comparison" - REPORT_TYPES = [RT_VALIDATION, RT_CV, RT_ALG_COMP ] + REPORT_TYPES = [RT_VALIDATION, RT_CV, RT_ALG_COMP, RT_METHOD_COMP ] # creates a report of a certain type according to the validation data in validation_set # @@ -40,11 +41,13 @@ module Reports::ReportFactory def self.create_report(type, validation_set, params={}, task=nil) case type when RT_VALIDATION - create_report_validation(validation_set, task) + create_report_validation(validation_set, {}, task) when RT_CV - create_report_crossvalidation(validation_set, task) + create_report_crossvalidation(validation_set, {}, task) when RT_ALG_COMP create_report_compare_algorithms(validation_set, params, task) + when RT_METHOD_COMP + create_report_compare_methods(validation_set, params, task) else raise "unknown report type "+type.to_s end @@ -78,7 +81,7 @@ module Reports::ReportFactory raise OpenTox::BadRequestError.new("num validations is not equal to 1") unless validation_set.size==1 val = validation_set.validations[0] - pre_load_predictions( validation_set, 
OpenTox::SubTask.create(task,0,80) ) + pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,50) ) report = Reports::ReportContent.new("Validation report") add_filter_warning(report, validation_set.filter_params) if validation_set.filter_params!=nil @@ -103,7 +106,6 @@ module Reports::ReportFactory report.add_confidence_plot(validation_set, :positive_predictive_value, accept_value) report.align_last_two_images "Confidence Plots" end - report.end_section when "regression" report.add_result(validation_set, [:validation_uri] + VAL_ATTR_TRAIN_TEST + VAL_ATTR_REGR, "Results", "Results") report.add_section("Plots") @@ -111,10 +113,13 @@ module Reports::ReportFactory report.add_confidence_plot(validation_set, :root_mean_squared_error, nil) report.add_confidence_plot(validation_set, :r_square, nil) report.align_last_two_images "Confidence Plots" - report.end_section end - task.progress(90) if task - + task.progress(70) if task + report.add_train_test_plot( validation_set, false, OpenTox::SubTask.create(task,70,80) ) + report.add_train_test_plot( validation_set, true, OpenTox::SubTask.create(task,80,90) ) + report.align_last_two_images "Training Test Data Distribution Plots" + report.end_section + report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results") report.add_predictions( validation_set ) task.progress(100) if task @@ -248,11 +253,11 @@ module Reports::ReportFactory when "classification" result_attributes += VAL_ATTR_CLASS ttest_attributes = VAL_ATTR_TTEST_CLASS - bar_plot_attributes = VAL_ATTR_BAR_PLOT_CLASS + box_plot_attributes = VAL_ATTR_BOX_PLOT_CLASS else result_attributes += VAL_ATTR_REGR ttest_attributes = VAL_ATTR_TTEST_REGR - bar_plot_attributes = VAL_ATTR_BAR_PLOT_REGR + box_plot_attributes = VAL_ATTR_BOX_PLOT_REGR end if params[:ttest_attributes] and params[:ttest_attributes].chomp.size>0 @@ -263,8 +268,8 @@ module Reports::ReportFactory ttest_significance = params[:ttest_significance].to_f end - bar_plot_attributes += ttest_attributes - bar_plot_attributes.uniq! + box_plot_attributes += ttest_attributes + box_plot_attributes.uniq! result_attributes += ttest_attributes result_attributes.uniq! @@ -287,13 +292,50 @@ module Reports::ReportFactory res_text = "These performance statistics have been derieved by computing the mean of the statistics on each crossvalidation fold." report.add_result(merged,result_attributes,res_titel,res_titel,res_text) # pending: regression stats have different scales!!! - report.add_bar_plot(merged, :identifier, bar_plot_attributes) if validation_set.unique_feature_type=="classification" + report.add_box_plot(set, :identifier, box_plot_attributes) report.add_paired_ttest_tables(set, :identifier, ttest_attributes, ttest_significance) if ttest_significance>0 report.end_section end task.progress(100) if task report end + + def self.create_report_compare_methods(validation_set, params={}, task=nil) + raise OpenTox::BadRequestError.new("num validations is not >1") unless validation_set.size>1 + raise OpenTox::BadRequestError.new("validations must have unique feature type, i.e. 
must be either all regression, "+ + "or all classification validations") unless validation_set.unique_feature_type + raise OpenTox::BadRequestError.new("number of different identifiers <2: "+ + validation_set.get_values(:identifier).inspect) if validation_set.num_different_values(:identifier)<2 + #validation_set.load_cv_attributes + + pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,80) ) + report = Reports::ReportContent.new("Method comparison report") + add_filter_warning(report, validation_set.filter_params) if validation_set.filter_params!=nil + + result_attributes = [:identifier,:validation_uri,:validation_report_uri]+VAL_ATTR_CV-[:crossvalidation_fold,:num_folds,:dataset_uri] + case validation_set.unique_feature_type + when "classification" + result_attributes += VAL_ATTR_CLASS + box_plot_attributes = VAL_ATTR_BOX_PLOT_CLASS + else + result_attributes += VAL_ATTR_REGR + box_plot_attributes = VAL_ATTR_BOX_PLOT_REGR + end + + merged = validation_set.merge([:identifier]) + merged.sort(:identifier) + + merged.validations.each do |v| + v.validation_uri = v.validation_uri.split(";").uniq.join(" ") + v.validation_report_uri = v.validation_report_uri.split(";").uniq.join(" ") if v.validation_report_uri + end + + msg = merged.validations.collect{|v| v.identifier+" ("+Lib::MergeObjects.merge_count(v).to_s+"x)"}.join(", ") + report.add_result(merged,result_attributes,"Average Results","Results",msg) + + report.add_box_plot(validation_set, :identifier, box_plot_attributes) + report + end end diff --git a/report/report_service.rb b/report/report_service.rb index 53a17ab..f315b04 100644 --- a/report/report_service.rb +++ b/report/report_service.rb @@ -89,7 +89,9 @@ module Reports report_content = Reports::ReportFactory.create_report(type, validation_set, params, OpenTox::SubTask.create(task,10,90)) LOGGER.debug "report created" - + Reports::quit_r + Reports.validation_access.delete_tmp_resources(subjectid) + #step 3: persist report if creation not failed id = @@persistance.new_report(report_content, type, create_meta_data(type, validation_set, validation_uris), self, subjectid) LOGGER.debug "report persisted with id: '"+id.to_s+"'" diff --git a/report/statistical_test.rb b/report/statistical_test.rb index 4d85555..da46f6b 100644 --- a/report/statistical_test.rb +++ b/report/statistical_test.rb @@ -1,38 +1,6 @@ #require "rubygems" #require "rinruby" -module LIB - class StatisticalTest - - # -1 -> array1 < array2 - # 0 -> not difference - # 1 -> array2 > array1 - # - def self.pairedTTest(array1, array2, significance_level=0.95) - - @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r - @@r.assign "v1",array1 - @@r.assign "v2",array2 - @@r.eval "ttest = t.test(v1,v2,paired=T)" - t = @@r.pull "ttest$statistic" - p = @@r.pull "ttest$p.value" - if (1-significance_level > p) - t - else - 0 - end - end - - def self.quit_r - begin - @@r.quit - @@r = nil - rescue - end - end - end -end - module Reports class ReportStatisticalTest @@ -72,27 +40,12 @@ module Reports array1 = validations1.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value].to_f : v.send(attribute).to_f) } array2 = validations2.collect{ |v| (v.send(attribute).is_a?(Hash) ? 
v.send(attribute)[class_value].to_f : v.send(attribute).to_f) } LOGGER.debug "paired-t-testing "+attribute.to_s+" "+array1.inspect+" vs "+array2.inspect - LIB::StatisticalTest.pairedTTest(array1, array2, significance_level) + Reports::r_util.paired_ttest(array1, array2, significance_level) end - def self.quit_r - LIB::StatisticalTest.quit_r - end - end end -#x=["1.36840891838074", "2.89500403404236", "2.58440494537354", "1.96544003486633", "1.4017288684845", "1.68250012397766", "1.65089893341064", "2.24862003326416", "3.73909902572632", "2.36335206031799"] -#y=["1.9675121307373", "2.30981087684631", "2.59359288215637", "2.62243509292603", "1.98700189590454", "2.26789593696594", "2.03917217254639", "2.69466996192932", "1.96487307548523", "1.65820598602295"] -#puts LIB::StatisticalTest.pairedTTest(x,y) -# -##t1 = Time.new -##10.times do -# puts LIB::StatisticalTest.pairedTTest([1.01,2,3,4,5,12,4,2],[2,3,3,3,56,3,4,5]) -##end -#LIB::StatisticalTest.quit_r -##t2 = Time.new -##puts t2-t1 diff --git a/report/validation_access.rb b/report/validation_access.rb index 2cb0c6f..aaa7bdc 100755 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -7,6 +7,8 @@ require "lib/validation_db.rb" # class Reports::ValidationDB + @@tmp_resources = [] + def same_service?(uri) self_uri = URI.parse($url_provider.url) val_uri = URI.parse(uri) @@ -132,27 +134,67 @@ class Reports::ValidationDB validation.send("#{p.to_s}=".to_sym, cv.send(p.to_s)) end end + + def training_feature_dataset_uri(validation, subjectid) + m = OpenTox::Model::Generic.find(validation.model_uri, subjectid) + if m + f = m.metadata[OT.featureDataset] + return f.chomp if f + end + raise "no feature dataset found" + end + def test_feature_dataset_uri(validation, subjectid) + m = OpenTox::Model::Generic.find(validation.model_uri, subjectid) + feat_gen = nil + m.metadata[OT.parameters].each do |h| + if h[DC.title] and h[DC.title]=~/feature_generation/ and h[OT.paramValue] + feat_gen = h[OT.paramValue] + break + end + end if m and m.metadata[OT.parameters] + raise "no feature creation alg found" unless feat_gen + feat_gen = File.join(feat_gen,"match") if feat_gen=~/fminer/ + uri = OpenTox::RestClientWrapper.post(feat_gen,{:subjectid => subjectid, + :feature_dataset_uri=>training_feature_dataset_uri(validation,subjectid), + :dataset_uri=>validation.test_dataset_uri}) + @@tmp_resources << uri + uri + end + + def delete_tmp_resources(subjectid) + @@tmp_resources.each do |uri| + OpenTox::RestClientWrapper.delete uri,{:subjectid=>subjectid} + end + @@tmp_resources = [] + end + def get_predictions(validation, filter_params, subjectid, task) # we need compound info, cannot reuse stored prediction data data = Lib::PredictionData.create( validation.feature_type, validation.test_dataset_uri, validation.test_target_dataset_uri, validation.prediction_feature, validation.prediction_dataset_uri, - validation.predicted_variable, validation.predicted_confidence, subjectid, task ) + validation.predicted_variable, validation.predicted_confidence, subjectid, OpenTox::SubTask.create(task, 0, 80 ) ) data = Lib::PredictionData.filter_data( data.data, data.compounds, filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions] ) if filter_params!=nil + task.progress(100) if task Lib::OTPredictions.new( data.data, data.compounds ) end def get_accept_values( validation, subjectid=nil ) # PENDING So far, one has to load the whole dataset to get the accept_value from ambit - test_target_dataset = 
validation.test_target_dataset_uri - test_target_dataset = validation.test_dataset_uri unless test_target_dataset - d = Lib::DatasetCache.find( test_target_dataset, subjectid ) - raise "cannot get test target dataset for accept values, dataset: "+test_target_dataset.to_s unless d - accept_values = d.accept_values(validation.prediction_feature) - raise "cannot get accept values from dataset "+test_target_dataset.to_s+" for feature "+ - validation.prediction_feature+":\n"+d.features[validation.prediction_feature].to_yaml unless accept_values!=nil - accept_values + test_target_datasets = validation.test_target_dataset_uri + test_target_datasets = validation.test_dataset_uri unless test_target_datasets + res = nil + test_target_datasets.split(";").each do |test_target_dataset| + d = Lib::DatasetCache.find( test_target_dataset, subjectid ) + raise "cannot get test target dataset for accept values, dataset: "+test_target_dataset.to_s unless d + accept_values = d.accept_values(validation.prediction_feature) + raise "cannot get accept values from dataset "+test_target_dataset.to_s+" for feature "+ + validation.prediction_feature+":\n"+d.features[validation.prediction_feature].to_yaml unless accept_values!=nil + raise "different accept values" if res && res!=accept_values + res = accept_values + end + res end def feature_type( validation, subjectid=nil ) diff --git a/report/validation_data.rb b/report/validation_data.rb index eea5229..3806fd7 100755 --- a/report/validation_data.rb +++ b/report/validation_data.rb @@ -94,6 +94,7 @@ module Reports @subjectid = subjectid raise unless filter_params==nil || filter_params.is_a?(Hash) @filter_params = filter_params + @created_resources = [] #raise "subjectid is nil" unless subjectid end @@ -102,7 +103,22 @@ module Reports Reports.validation_access.init_validation_from_cv_statistics(v, cv_uri, filter_params, subjectid) v end - + + def training_feature_dataset_uri + unless @training_feature_dataset + @training_feature_dataset = Reports.validation_access.training_feature_dataset_uri( self, @subjectid ) + end + @training_feature_dataset + end + + #hack this does create the features for the test dataset + def test_feature_dataset_uri + unless @test_feature_dataset + @test_feature_dataset = Reports.validation_access.test_feature_dataset_uri( self, @subjectid ) + end + @test_feature_dataset + end + # returns/creates predictions, cache to save rest-calls/computation time # # call-seq: @@ -402,12 +418,17 @@ module Reports end if variance + #puts "variance given #{a}, #{val.inspect}, #{val.class}, #{variance.inspect}, #{variance.class}" if (val.is_a?(Array)) raise "not implemented" elsif (val.is_a?(Hash)) val.collect{ |i,j| i.to_nice_s+": "+j.to_nice_s + " +- " + variance[i].to_nice_s }.join(", ") else + if (variance.is_a?(Hash)) + raise "invalid variance" unless accept_values.size==1 && accept_values[0]!=nil + variance = variance[accept_values[0]] + end val.to_nice_s + " +- " + variance.to_nice_s end else diff --git a/validation/validation_service.rb b/validation/validation_service.rb index dceead9..614363d 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -2,7 +2,6 @@ require "lib/validation_db.rb" require "lib/ot_predictions.rb" -require "lib/r-util.rb" require "validation/validation_format.rb" @@ -639,16 +638,10 @@ module Validation end if stratified - Lib::RUtil.init_r - df = Lib::RUtil.dataset_to_dataframe( orig_dataset ) - split = Lib::RUtil.stratified_split( df, split_ratio, random_seed ) - Lib::RUtil.quit_r - raise 
"internal error" unless split.size==orig_dataset.compounds.size - task.progress(33) if task - - training_compounds = [] - split.size.times{|i| training_compounds << orig_dataset.compounds[i] if split[i]==1} - test_compounds = orig_dataset.compounds - training_compounds + r_util = OpenTox::RUtil.new + split_sets = r_util.stratified_split( orig_dataset, "NA", df, split_ratio, random_seed ) + r_util.quit_r + result = {:training_dataset_uri => split_sets[0], :test_dataset_uri => split_sets[1]} else compounds = orig_dataset.compounds raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2 @@ -661,37 +654,36 @@ module Validation compounds.shuffle!( random_seed ) training_compounds = compounds[0..split] test_compounds = compounds[(split+1)..-1] + task.progress(33) if task + + result = {} + result[:training_dataset_uri] = orig_dataset.split( training_compounds, + orig_dataset.features.keys, + { DC.title => "Training dataset split of "+orig_dataset.title.to_s, + DC.creator => $url_provider.url_for('/training_test_split',:full) }, + subjectid ).uri + task.progress(66) if task + + result[:test_dataset_uri] = orig_dataset.split( test_compounds, + orig_dataset.features.keys.dclone - [prediction_feature], + { DC.title => "Test dataset split of "+orig_dataset.title.to_s, + DC.creator => $url_provider.url_for('/training_test_split',:full) }, + subjectid ).uri + task.progress(100) if task + + if !stratified and ENV['RACK_ENV'] =~ /test|debug/ + raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless + Lib::DatasetCache.find(result[:training_dataset_uri],subjectid) + test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid + raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless test_data + test_data.load_compounds subjectid + raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+ + test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split) + end + + LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'" end - task.progress(33) if task - - result = {} - - result[:training_dataset_uri] = orig_dataset.split( training_compounds, - orig_dataset.features.keys, - { DC.title => "Training dataset split of "+orig_dataset.title.to_s, - DC.creator => $url_provider.url_for('/training_test_split',:full) }, - subjectid ).uri - task.progress(66) if task - - result[:test_dataset_uri] = orig_dataset.split( test_compounds, - orig_dataset.features.keys.dclone - [prediction_feature], - { DC.title => "Test dataset split of "+orig_dataset.title.to_s, - DC.creator => $url_provider.url_for('/training_test_split',:full) }, - subjectid ).uri - task.progress(100) if task - - if !stratified and ENV['RACK_ENV'] =~ /test|debug/ - raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless - Lib::DatasetCache.find(result[:training_dataset_uri],subjectid) - test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid - raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless test_data - test_data.load_compounds subjectid - raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+ - test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless 
test_data.compounds.size==(compounds.size-1-split) - end - - LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'" - return result + result end end -- cgit v1.2.3
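
For reference, the commented-out self-test removed from report/statistical_test.rb maps onto the shared R utility as follows. This is a sketch: paired_ttest is assumed to keep the semantics of the deleted pairedTTest, i.e. it returns the t statistic when p < 1 - significance_level and 0 otherwise; the sample values are the old example data, rounded:

    # example data from the deleted block in report/statistical_test.rb
    x = [1.368, 2.895, 2.584, 1.965, 1.402, 1.683, 1.651, 2.249, 3.739, 2.363]
    y = [1.968, 2.310, 2.594, 2.622, 1.987, 2.268, 2.039, 2.695, 1.965, 1.658]

    # 0 means no significant difference at the 0.95 level
    puts Reports::r_util.paired_ttest(x, y, 0.95)
    Reports::quit_r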