From ddc100289dd5ff412245bf7ee5d13a6fdb38853f Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 23 Nov 2011 15:57:10 +0100 Subject: add validation to pure-yaml code for serialization --- validation/validation_application.rb | 52 +++++++++++++++++++++++++++++++++--- validation/validation_format.rb | 4 +-- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 7e0e10f..a80396c 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -153,9 +153,13 @@ get '/crossvalidation/:id' do "A crossvalidation resource." content_type "text/html" OpenTox.text_to_html crossvalidation.to_yaml,@subjectid,related_links,description + when "application/serialize" + content_type "application/serialize" + crossvalidation.get_content_as_hash # to load all the stuff + crossvalidation.to_yaml when /application\/x-yaml|\*\/\*/ content_type "application/x-yaml" - crossvalidation.to_yaml + crossvalidation.to_rdf_yaml else raise OpenTox::BadRequestError.new "MIME type '"+request.env['HTTP_ACCEPT'].to_s+"' not supported, valid Accept-Headers: \"application/rdf+xml\", \"application/x-yaml\", \"text/html\"." end @@ -172,13 +176,17 @@ get '/crossvalidation/:id/statistics' do description = "The averaged statistics for the crossvalidation." 
content_type "text/html" - OpenTox.text_to_html v.to_yaml,@subjectid,related_links,description + OpenTox.text_to_html v.to_rdf_yaml,@subjectid,related_links,description when "application/rdf+xml" content_type "application/rdf+xml" v.to_rdf + when "application/serialize" + content_type "application/serialize" + v.get_content_as_hash # to load all the stuff + v.to_yaml else content_type "application/x-yaml" - v.to_yaml + v.to_rdf_yaml end end @@ -562,6 +570,38 @@ post '/validate_datasets' do return_task(task) end +get '/:id/verify_r_square' do + + #PENDING: this is debug code, move to test-suite + + validation = Validation::Validation.get(params[:id]) + p = validation.compute_validation_stats_with_model(nil, true) + + puts "actual "+p.actual_values.inspect + puts "predicted "+p.predicted_values.inspect + puts "" + + puts "ot r-square "+p.r_square.to_s + puts "ot sample_correlation_coefficient "+p.sample_correlation_coefficient.to_s + puts "ot sample_correlation_coefficient**2 "+(p.sample_correlation_coefficient**2).to_s + puts "" + + @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r + @@r.assign "v1",p.actual_values + @@r.assign "v2",p.predicted_values + puts "r cor "+@@r.pull("cor(v1,v2)").to_s + # @@r.eval "ttest = t.test(v1,v2,paired=T)" + # t = @@r.pull "ttest$statistic" + @@r.eval "fit <- lm(v1 ~ v2)" + @@r.eval "sum <- summary(fit)" + puts "r r-square "+@@r.pull("sum$r.squared").to_s + puts "r adjusted-r-square "+@@r.pull("sum$adj.r.squared").to_s + + @@r.quit + @@r = nil + +end + get '/:id/predictions' do LOGGER.info "get validation predictions "+params.inspect begin @@ -627,9 +667,13 @@ get '/:id' do "All validations: "+url_for("/",:full)+"\n"+ "All validation reports: "+url_for("/report/validation",:full) OpenTox.text_to_html validation.to_yaml,@subjectid,related_links,description + when "application/serialize" + content_type "application/serialize" + validation.get_content_as_hash # to load all the stuff + validation.to_yaml else #default is yaml 
content_type "application/x-yaml" - validation.to_yaml + validation.to_rdf_yaml end end diff --git a/validation/validation_format.rb b/validation/validation_format.rb index 23b1996..3c9a8ef 100755 --- a/validation/validation_format.rb +++ b/validation/validation_format.rb @@ -67,7 +67,7 @@ module Validation s.to_rdfxml end - def to_yaml + def to_rdf_yaml get_content_as_hash.keys_to_rdf_format.keys_to_owl_uris.to_yaml end @@ -96,7 +96,7 @@ module Validation s.to_rdfxml end - def to_yaml + def to_rdf_yaml get_content_as_hash.keys_to_rdf_format.keys_to_owl_uris.to_yaml end end -- cgit v1.2.3 From d5a53dec3b7e97bb8749dc4cd9afefc08ee15768 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 23 Nov 2011 16:02:12 +0100 Subject: adjust yaml version for to-html --- validation/validation_application.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validation/validation_application.rb b/validation/validation_application.rb index a80396c..d3c4c2f 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -152,7 +152,7 @@ get '/crossvalidation/:id' do description = "A crossvalidation resource." 
content_type "text/html" - OpenTox.text_to_html crossvalidation.to_yaml,@subjectid,related_links,description + OpenTox.text_to_html crossvalidation.to_rdf_yaml,@subjectid,related_links,description when "application/serialize" content_type "application/serialize" crossvalidation.get_content_as_hash # to load all the stuff @@ -666,7 +666,7 @@ get '/:id' do "Get validation predictions: "+url_for("/"+params[:id]+"/predictions",:full)+"\n"+ "All validations: "+url_for("/",:full)+"\n"+ "All validation reports: "+url_for("/report/validation",:full) - OpenTox.text_to_html validation.to_yaml,@subjectid,related_links,description + OpenTox.text_to_html validation.to_rdf_yaml,@subjectid,related_links,description when "application/serialize" content_type "application/serialize" validation.get_content_as_hash # to load all the stuff -- cgit v1.2.3 From c16535924b2941e9e7316d982d8836cdb83e6b5a Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 23 Nov 2011 16:23:40 +0100 Subject: fix load validations for serialization --- validation/validation_application.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/validation/validation_application.rb b/validation/validation_application.rb index d3c4c2f..4b6763a 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -155,7 +155,7 @@ get '/crossvalidation/:id' do OpenTox.text_to_html crossvalidation.to_rdf_yaml,@subjectid,related_links,description when "application/serialize" content_type "application/serialize" - crossvalidation.get_content_as_hash # to load all the stuff + crossvalidation.inspect # to load all the stuff crossvalidation.to_yaml when /application\/x-yaml|\*\/\*/ content_type "application/x-yaml" @@ -182,7 +182,7 @@ get '/crossvalidation/:id/statistics' do v.to_rdf when "application/serialize" content_type "application/serialize" - v.get_content_as_hash # to load all the stuff + v.inspect # to load all the stuff v.to_yaml else content_type "application/x-yaml" 
@@ -669,7 +669,7 @@ get '/:id' do OpenTox.text_to_html validation.to_rdf_yaml,@subjectid,related_links,description when "application/serialize" content_type "application/serialize" - validation.get_content_as_hash # to load all the stuff + validation.inspect # to load all the stuff validation.to_yaml else #default is yaml content_type "application/x-yaml" -- cgit v1.2.3 From 95703c1e7d3f6e98a200cf6dfd1cfef3a0ca0479 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 23 Nov 2011 17:12:14 +0100 Subject: enable reporting with external validations --- report/validation_access.rb | 94 ++++++++++++++++++++++++++++++--------------- report/validation_data.rb | 2 +- 2 files changed, 63 insertions(+), 33 deletions(-) diff --git a/report/validation_access.rb b/report/validation_access.rb index 299b124..3b5335c 100755 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -7,27 +7,37 @@ require "lib/validation_db.rb" # class Reports::ValidationDB + def same_service?(uri) + self_uri = URI.parse($url_provider.url) + val_uri = URI.parse(uri) + self_uri.host == val_uri.host && self_uri.port == val_uri.port + end + def resolve_cv_uris(validation_uris, identifier=nil, subjectid=nil) res = {} count = 0 validation_uris.each do |u| + if u.to_s =~ /.*\/crossvalidation\/[0-9]+/ - cv_id = u.split("/")[-1].to_i cv = nil + cv_id = u.split("/")[-1].to_i + val_uris = nil - raise OpenTox::NotAuthorizedError.new "Not authorized: GET "+u.to_s if - AA_SERVER and !OpenTox::Authorization.authorized?(u,"GET",subjectid) -# begin -# #cv = Lib::Crossvalidation.find( cv_id ) -# rescue => ex -# raise "could not access crossvalidation with id "+validation_id.to_s+", error-msg: "+ex.message -# end - cv = Validation::Crossvalidation.get( cv_id ) - raise OpenTox::NotFoundError.new "crossvalidation with id "+cv_id.to_s+" not found" unless cv - raise OpenTox::BadRequestError.new("crossvalidation with id '"+cv_id.to_s+"' not finished") unless cv.finished - #res += Validation::Validation.find( 
:all, :conditions => { :crossvalidation_id => cv_id } ).collect{|v| v.validation_uri.to_s} - Validation::Validation.find( :crossvalidation_id => cv_id, :validation_type => "crossvalidation" ).each do |v| - res[v.validation_uri.to_s] = identifier ? identifier[count] : nil + if same_service?u + raise OpenTox::NotAuthorizedError.new "Not authorized: GET "+u.to_s if + AA_SERVER and !OpenTox::Authorization.authorized?(u,"GET",subjectid) + cv = Validation::Crossvalidation.get( cv_id ) + raise OpenTox::NotFoundError.new "crossvalidation with id "+cv_id.to_s+" not found" unless cv + raise OpenTox::BadRequestError.new("crossvalidation with id '"+cv_id.to_s+"' not finished") unless cv.finished + #res += Validation::Validation.find( :all, :conditions => { :crossvalidation_id => cv_id } ).collect{|v| v.validation_uri.to_s} + val_uris = Validation::Validation.find( :crossvalidation_id => cv_id, :validation_type => "crossvalidation" ).collect{|v| v.validation_uri.to_s} + else + val_base_uri = u.gsub(/\/crossvalidation\/[0-9]+/,"") + val_uris = OpenTox::RestClientWrapper.get( val_base_uri+"?crossvalidation_id="+cv_id.to_s+"&validation_type=crossvalidation", {:subjectid => subjectid, :accept => "text/uri-list" }).split("\n") + end + + val_uris.each do |v_uri| + res[v_uri] = identifier ? identifier[count] : nil end else res[u.to_s] = identifier ? identifier[count] : nil @@ -43,34 +53,49 @@ class Reports::ValidationDB validation_id = uri.split("/")[-1] raise OpenTox::BadRequestError.new "invalid validation id "+validation_id.to_s unless validation_id!=nil and (validation_id.to_i > 0 || validation_id.to_s=="0" ) + v = nil - raise OpenTox::NotAuthorizedError.new "Not authorized: GET "+uri.to_s if - AA_SERVER and !OpenTox::Authorization.authorized?(uri,"GET",subjectid) - v = Validation::Validation.get(validation_id) + + if same_service? 
uri + raise OpenTox::NotAuthorizedError.new "Not authorized: GET "+uri.to_s if + AA_SERVER and !OpenTox::Authorization.authorized?(uri,"GET",subjectid) + v = Validation::Validation.get(validation_id) + else + v = YAML::load(OpenTox::RestClientWrapper.get uri, {:subjectid=>subjectid, :accept=>"application/serialize"}) + end raise OpenTox::NotFoundError.new "validation with id "+validation_id.to_s+" not found" unless v raise OpenTox::BadRequestError.new "validation with id "+validation_id.to_s+" is not finished yet" unless v.finished - (Validation::VAL_PROPS + Validation::VAL_CV_PROPS).each do |p| validation.send("#{p.to_s}=".to_sym, v.send(p)) end + # set uris manually, in case external validation is used + validation.validation_uri = uri + validation.crossvalidation_uri = uri.gsub(/\/[0-9]+/,"")+"/crossvalidation/"+validation.crossvalidation_id if validation.crossvalidation_id!=nil + {:classification_statistics => Validation::VAL_CLASS_PROPS, :regression_statistics => Validation::VAL_REGR_PROPS}.each do |subset_name,subset_props| subset = v.send(subset_name) - subset_props.each{ |prop| validation.send("#{prop.to_s}=".to_sym, subset[prop]) } if subset + subset_props.each{ |prop| validation.send("#{prop.to_s}=".to_sym, subset[prop]) } if subset end end def init_validation_from_cv_statistics( validation, cv_uri, subjectid=nil ) raise OpenTox::BadRequestError.new "not a crossvalidation uri: "+cv_uri.to_s unless cv_uri.uri? 
and cv_uri =~ /crossvalidation.*\/[0-9]+$/ - cv_id = cv_uri.split("/")[-1] - raise OpenTox::NotAuthorizedError.new "Not authorized: GET "+cv_uri.to_s if - AA_SERVER and !OpenTox::Authorization.authorized?(cv_uri,"GET",subjectid) - cv = Validation::Crossvalidation.get(cv_id) - raise OpenTox::NotFoundError.new "crossvalidation with id "+crossvalidation_id.to_s+" not found" unless cv - raise OpenTox::BadRequestError.new "crossvalidation with id "+crossvalidation_id.to_s+" is not finished yet" unless cv.finished - v = Validation::Validation.from_cv_statistics(cv_id, subjectid) + + if same_service?cv_uri + cv_id = cv_uri.split("/")[-1] + raise OpenTox::NotAuthorizedError.new "Not authorized: GET "+cv_uri.to_s if + AA_SERVER and !OpenTox::Authorization.authorized?(cv_uri,"GET",subjectid) + cv = Validation::Crossvalidation.get(cv_id) + raise OpenTox::NotFoundError.new "crossvalidation with id "+crossvalidation_id.to_s+" not found" unless cv + raise OpenTox::BadRequestError.new "crossvalidation with id "+crossvalidation_id.to_s+" is not finished yet" unless cv.finished + v = Validation::Validation.from_cv_statistics(cv_id, subjectid) + else + cv = YAML::load(OpenTox::RestClientWrapper.get cv_uri, {:subjectid=>subjectid, :accept=>"application/serialize"}) + v = YAML::load(OpenTox::RestClientWrapper.get cv_uri+"/statistics", {:subjectid=>subjectid, :accept=>"application/serialize"}) + end (Validation::VAL_PROPS + Validation::VAL_CV_PROPS).each do |p| validation.send("#{p.to_s}=".to_sym, v.send(p)) end @@ -84,20 +109,25 @@ class Reports::ValidationDB validation.send("#{p.to_s}=".to_sym, cv.send(p.to_s)) end validation.crossvalidation_uri = cv_uri + validation.validation_uri = cv_uri+"/statistics" end - def init_cv(validation) - - #cv = Lib::Crossvalidation.find(validation.crossvalidation_id) - cv = Validation::Crossvalidation.get(validation.crossvalidation_id) - raise OpenTox::BadRequestError.new "no crossvalidation found with id "+validation.crossvalidation_id.to_s unless cv 
+ def init_cv(validation, subjectid) + cv = nil + if same_service?validation.crossvalidation_uri + cv = Validation::Crossvalidation.get(validation.crossvalidation_id) + raise OpenTox::BadRequestError.new "no crossvalidation found with id "+validation.crossvalidation_id.to_s unless cv + else + cv = YAML::load(OpenTox::RestClientWrapper.get validation.crossvalidation_uri, {:subjectid=>subjectid, :accept=>"application/serialize"}) + end Validation::CROSS_VAL_PROPS.each do |p| - validation.send("#{p.to_s}=".to_sym, cv.send(p.to_s)) + validation.send("#{p.to_s}=".to_sym, cv.send(p.to_s)) end end def get_predictions(validation, subjectid=nil, task=nil) + Lib::OTPredictions.new( validation.feature_type, validation.test_dataset_uri, validation.test_target_dataset_uri, validation.prediction_feature, validation.prediction_dataset_uri, validation.predicted_variable, validation.predicted_confidence, subjectid, task) diff --git a/report/validation_data.rb b/report/validation_data.rb index f5ecae7..61761ab 100755 --- a/report/validation_data.rb +++ b/report/validation_data.rb @@ -148,7 +148,7 @@ module Reports # loads all crossvalidation attributes, of the corresponding cv into this object def load_cv_attributes raise "crossvalidation-id not set" unless @crossvalidation_id - Reports.validation_access.init_cv(self) + Reports.validation_access.init_cv(self, @subjectid) # load cv report ids = Reports.persistance.list_reports("crossvalidation",{:crossvalidation=>self.crossvalidation_uri.to_s }) @crossvalidation_report_uri = ReportService.instance.get_uri("crossvalidation",ids[-1]) if ids and ids.size>0 -- cgit v1.2.3 From cf60c03db2481d3816e63f058a7ed12d905ac833 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Fri, 25 Nov 2011 09:07:50 +0100 Subject: add r-square plot, fix prediction updating, add weighted sample-correlation-coefficient --- lib/predictions.rb | 173 +++++++++++++++++++++++++++++------ report/plot_factory.rb | 161 ++++++++++++++------------------ 
report/report_content.rb | 8 +- report/report_factory.rb | 24 +++-- validation/validation_application.rb | 32 ------- 5 files changed, 235 insertions(+), 163 deletions(-) diff --git a/lib/predictions.rb b/lib/predictions.rb index 6c0e996..56bdd22 100755 --- a/lib/predictions.rb +++ b/lib/predictions.rb @@ -25,9 +25,6 @@ module Lib feature_type, accept_values=nil ) - @predicted_values = predicted_values - @actual_values = actual_values - @confidence_values = confidence_values @feature_type = feature_type @accept_values = accept_values @num_classes = 1 @@ -38,34 +35,27 @@ module Lib raise "unknown feature_type: '"+@feature_type.to_s+"'" unless @feature_type=="classification" || @feature_type=="regression" - raise "no predictions" if @predicted_values.size == 0 - num_info = "predicted:"+@predicted_values.size.to_s+ - " confidence:"+@confidence_values.size.to_s+" actual:"+@actual_values.size.to_s - raise "illegal num actual values "+num_info if @actual_values.size != @predicted_values.size - raise "illegal num confidence values "+num_info if @confidence_values.size != @predicted_values.size - - @confidence_values.each{ |c| raise "illegal confidence value: '"+c.to_s+"'" unless c==nil or (c.is_a?(Numeric) and c>=0 and c<=1) } + raise "no predictions" if predicted_values.size == 0 + num_info = "predicted:"+predicted_values.size.to_s+ + " confidence:"+confidence_values.size.to_s+" actual:"+actual_values.size.to_s + raise "illegal num actual values "+num_info if actual_values.size != predicted_values.size + raise "illegal num confidence values "+num_info if confidence_values.size != predicted_values.size case @feature_type when "classification" raise "accept_values missing while performing classification" unless @accept_values @num_classes = @accept_values.size raise "num classes < 2" if @num_classes<2 - { "predicted"=>@predicted_values, "actual"=>@actual_values }.each do |s,values| - values.each{ |v| raise "illegal "+s+" classification-value ("+v.to_s+"),"+ - "has to be 
either nil or index of predicted-values" if v!=nil and (!v.is_a?(Numeric) or v<0 or v>@num_classes)} - end when "regression" raise "accept_values != nil while performing regression" if @accept_values - { "predicted"=>@predicted_values, "actual"=>@actual_values }.each do |s,values| - values.each{ |v| raise "illegal "+s+" regression-value ("+v.to_s+"),"+ - " has to be either nil or number (not NaN, not Infinite)" unless v==nil or (v.is_a?(Numeric) and !v.nan? and v.finite?)} - end end + @predicted_values = [] + @actual_values = [] + @confidence_values = [] init_stats() - (0..@predicted_values.size-1).each do |i| - update_stats( @predicted_values[i], @actual_values[i], @confidence_values[i] ) + (0..predicted_values.size-1).each do |i| + update_stats( predicted_values[i], actual_values[i], confidence_values[i] ) end end @@ -114,6 +104,13 @@ module Lib @sum_squares_actual = 0 @sum_squares_predicted = 0 + @sum_confidence = 0 + @weighted_sum_actual = 0 + @weighted_sum_predicted = 0 + @weighted_sum_multiply = 0 + @weighted_sum_squares_actual = 0 + @weighted_sum_squares_predicted = 0 + @sum_weighted_abs_error = 0 @sum_weighted_squared_error = 0 end @@ -121,6 +118,25 @@ module Lib def update_stats( predicted_value, actual_value, confidence_value ) + raise "illegal confidence value: '"+confidence_value.to_s+"'" unless + confidence_value==nil or (confidence_value.is_a?(Numeric) and confidence_value>=0 and confidence_value<=1) + case @feature_type + when "classification" + { "predicted"=>predicted_value, "actual"=>actual_value }.each do |s,v| + raise "illegal "+s+" classification-value ("+v.to_s+"),"+ + "has to be either nil or index of predicted-values" if v!=nil and (!v.is_a?(Numeric) or v<0 or v>@num_classes) + end + when "regression" + { "predicted"=>predicted_value, "actual"=>actual_value }.each do |s,v| + raise "illegal "+s+" regression-value ("+v.to_s+"),"+ + " has to be either nil or number (not NaN, not Infinite)" unless v==nil or (v.is_a?(Numeric) and !v.nan? 
and v.finite?) + end + end + + @predicted_values << predicted_value + @actual_values << actual_value + @confidence_values << confidence_value + if actual_value==nil @num_no_actual_value += 1 else @@ -165,6 +181,16 @@ module Lib @sum_multiply += (actual_value*predicted_value) @sum_squares_actual += actual_value**2 @sum_squares_predicted += predicted_value**2 + + if @conf_provided + w_a = actual_value * confidence_value + w_p = predicted_value * confidence_value + @weighted_sum_actual += w_a + @weighted_sum_predicted += w_p + @weighted_sum_multiply += (w_a*w_p) if @conf_provided + @weighted_sum_squares_actual += w_a**2 if @conf_provided + @weighted_sum_squares_predicted += w_p**2 if @conf_provided + end end end end @@ -514,7 +540,7 @@ module Lib return @sum_squared_error end - def r_square + def r_square #_old #return sample_correlation_coefficient ** 2 # see http://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions @@ -525,7 +551,7 @@ module Lib ( r_2.infinite? || r_2.nan? ) ? 0 : r_2 end - def weighted_r_square + def weighted_r_square #_old return 0 unless confidence_values_available? ss_tot = weighted_total_sum_of_squares return 0 if ss_tot==0 @@ -533,6 +559,16 @@ module Lib ( r_2.infinite? || r_2.nan? ) ? 
0 : r_2 end + #def r_square + # # as implemted in R + # return sample_correlation_coefficient ** 2 + #end + + #def weighted_r_square + # # as implemted in R + # return weighted_sample_correlation_coefficient ** 2 + #end + def sample_correlation_coefficient begin # formula see http://en.wikipedia.org/wiki/Correlation_and_dependence#Pearson.27s_product-moment_coefficient @@ -543,6 +579,16 @@ module Lib rescue; 0; end end + def weighted_sample_correlation_coefficient + begin + # formula see http://en.wikipedia.org/wiki/Correlation_and_dependence#Pearson.27s_product-moment_coefficient + scc = ( @num_predicted * @weighted_sum_multiply - @weighted_sum_actual * @weighted_sum_predicted ) / + ( Math.sqrt( @num_predicted * @weighted_sum_squares_actual - @weighted_sum_actual**2 ) * + Math.sqrt( @num_predicted * @weighted_sum_squares_predicted - @weighted_sum_predicted**2 ) ) + ( scc.infinite? || scc.nan? ) ? 0 : scc + rescue; 0; end + end + def total_sum_of_squares #return @variance_actual * ( @num_predicted - 1 ) sum = 0 @@ -608,17 +654,23 @@ module Lib return h end - def get_prediction_values(actual_accept_value, predicted_accept_value) + def get_prediction_values(performance_attr, performance_accept_value) #puts "get_roc_values for class_value: "+class_value.to_s raise "no confidence values" unless confidence_values_available? 
#raise "no class-value specified" if class_value==nil + actual_accept_value = nil + predicted_accept_value = nil + if performance_attr==:true_positive_rate + actual_accept_value = performance_accept_value + elsif performance_attr==:positive_predictive_value + predicted_accept_value = performance_accept_value + end actual_class_index = @accept_values.index(actual_accept_value) if actual_accept_value!=nil raise "class not found '"+actual_accept_value.to_s+"' in "+@accept_values.inspect if (actual_accept_value!=nil && actual_class_index==nil) - predicted_class_index = @accept_values.index(predicted_accept_value) if predicted_accept_value!=nil - raise "class not found "+predicted_accept_value.to_s+" in "+@accept_values.inspect if (predicted_accept_value!=nil && predicted_class_index==nil) + raise "class not found '"+predicted_accept_value.to_s+"' in "+@accept_values.inspect if (predicted_accept_value!=nil && predicted_class_index==nil) c = []; p = []; a = [] (0..@predicted_values.size-1).each do |i| @@ -697,6 +749,67 @@ module Lib #end private + def self.test_update + p=[0.4,0.2,0.3,0.5,0.8] + a=[0.45,0.21,0.25,0.55,0.75] + c = Array.new(p.size) + pred = Predictions.new(p,a,c,"regression") + puts pred.r_square + + pred = nil + p.size.times do |i| + if pred==nil + pred = Predictions.new([p[0]],[a[0]],[c[0]],"regression") + else + pred.update_stats(p[i],a[i],c[i]) + end + puts pred.r_square + end + end + + def self.test_r_square + require "rubygems" + require "opentox-ruby" + + max_deviation = rand * 0.9 + avg_deviation = max_deviation * 0.5 + + p = [] + a = [] + c = [] + (100 + rand(1000)).times do |i| + r = rand + deviation = rand * max_deviation + a << r + p << r + ((rand<0.5 ? 
-1 : 1) * deviation) + #c << 0.5 + if (deviation > avg_deviation) + c << 0.4 + else + c << 0.6 + end + #puts a[-1].to_s+" "+p[-1].to_s + end + puts "num values "+p.size.to_s + + pred = Predictions.new(p,a,c,"regression") + puts "internal" + #puts "r-square old "+pred.r_square_old.to_s + puts "cor "+pred.sample_correlation_coefficient.to_s + puts "weighted cor "+pred.weighted_sample_correlation_coefficient.to_s + puts "r-square "+pred.r_square.to_s + + puts "R" + @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r + @@r.assign "v1",a + @@r.assign "v2",p + puts "r cor "+@@r.pull("cor(v1,v2)").to_s + @@r.eval "fit <- lm(v1 ~ v2)" + @@r.eval "sum <- summary(fit)" + puts "r r-square "+@@r.pull("sum$r.squared").to_s + puts "r adjusted-r-square "+@@r.pull("sum$adj.r.squared").to_s + end + def prediction_feature_value_map(proc) res = {} (0..@num_classes-1).each do |i| @@ -706,4 +819,12 @@ module Lib end end -end \ No newline at end of file +end + +#class Float +# def to_s +# "%.5f" % self +# end +#end +##Lib::Predictions.test_update +#Lib::Predictions.test_r_square diff --git a/report/plot_factory.rb b/report/plot_factory.rb index 2074ce5..6083d26 100644 --- a/report/plot_factory.rb +++ b/report/plot_factory.rb @@ -2,6 +2,10 @@ ENV['JAVA_HOME'] = "/usr/bin" unless ENV['JAVA_HOME'] ENV['PATH'] = ENV['JAVA_HOME']+":"+ENV['PATH'] unless ENV['PATH'].split(":").index(ENV['JAVA_HOME']) ENV['RANK_PLOTTER_JAR'] = "RankPlotter/RankPlotter.jar" unless ENV['RANK_PLOTTER_JAR'] +CONF_PLOT_RANGE = { :accuracy => [0.45,1.05], :true_positive_rate => [0.45,1.05],:true_negative_rate => [0.45,1.05], + :false_positive_rate => [0.45,1.05], :false_negative_rate => [0.45,1.05], :positive_predictive_value => [0.45,1.05], + :negative_predictive_value => [0.45,1.05], :r_square => [0, 1.05], :sample_correlation_coefficient => [0, 1.05] } + class Array def swap!(i,j) tmp = self[i] @@ -47,7 +51,6 @@ class Array end end - module Reports module PlotFactory @@ -81,9 +84,11 @@ module Reports y_i = 
valid_indices.collect{ |i| y_i[i] } end - names << ( name_attribute==:crossvalidation_fold ? "fold " : "" ) + v.send(name_attribute).to_s - x << x_i - y << y_i + if x_i.size>0 + names << ( name_attribute==:crossvalidation_fold ? "fold " : "" ) + v.send(name_attribute).to_s + x << x_i + y << y_i + end end names = [""] if names.size==1 @@ -130,31 +135,22 @@ module Reports end end - def self.confidence_plot_class_performance( validation_set, actual_accept_value, predicted_accept_value ) + def self.confidence_plot_class_performance( validation_set, performance_attribute, performance_accept_value ) true_class = nil - if actual_accept_value==nil and predicted_accept_value==nil - perf = "Accuracy" - elsif actual_accept_value!=nil - if validation_set.get_true_accept_value==actual_accept_value - perf = "True Positive Rate" - true_class = actual_accept_value - elsif validation_set.get_accept_values.size==2 and validation_set.get_true_accept_value==(validation_set.get_accept_values-[actual_accept_value])[0] - perf = "True Negative Rate" + if performance_accept_value==nil + perf = performance_attribute.to_s.nice_attr + else + invert_true_class = (validation_set.get_accept_values.size==2 and + validation_set.get_true_accept_value==(validation_set.get_accept_values-[performance_accept_value])[0]) + if invert_true_class && performance_attribute==:true_positive_rate + perf = :true_negative_rate.to_s.nice_attr true_class = validation_set.get_true_accept_value - else - perf = "True Positive Rate" - true_class = actual_accept_value - end - elsif predicted_accept_value!=nil - if validation_set.get_true_accept_value==predicted_accept_value - perf = "Positive Predictive Value" - true_class = predicted_accept_value - elsif validation_set.get_accept_values.size==2 and validation_set.get_true_accept_value==(validation_set.get_accept_values-[predicted_accept_value])[0] - perf = "Negative Predictive Value" + elsif invert_true_class && performance_attribute==:positive_predictive_value + perf 
= :negative_predictive_value.to_s.nice_attr true_class = validation_set.get_true_accept_value else - perf = "Positive Predictive Value" - true_class = predicted_accept_value + perf = performance_attribute.to_s.nice_attr + true_class = performance_accept_value end end title = perf+" vs Confidence Plot" @@ -162,12 +158,8 @@ module Reports {:title =>title, :performance => perf} end - - def self.create_confidence_plot( out_files, validation_set, actual_accept_value = nil, - predicted_accept_value = nil, split_set_attribute=nil, show_single_curves=false ) + def self.create_confidence_plot( out_files, validation_set, performance_attribute, performance_accept_value, split_set_attribute=nil, show_single_curves=false ) - raise "param combination not supported" if actual_accept_value!=nil and predicted_accept_value!=nil - out_files = [out_files] unless out_files.is_a?(Array) LOGGER.debug "creating confidence plot for '"+validation_set.size.to_s+"' validations, out-file:"+out_files.inspect @@ -178,7 +170,7 @@ module Reports performance = [] attribute_values.each do |value| begin - data = transform_confidence_predictions(validation_set.filter({split_set_attribute => value}), actual_accept_value, predicted_accept_value, false) + data = transform_confidence_predictions(validation_set.filter({split_set_attribute => value}), performance_attribute, performance_accept_value, false) names << split_set_attribute.to_s.nice_attr+" "+value.to_s confidence << data[:confidence][0] performance << data[:performance][0] @@ -186,31 +178,21 @@ module Reports LOGGER.warn "could not create confidence plot for "+value.to_s end end - #RubyPlot::plot_lines(out_file, "Percent Correct vs Confidence Plot", "Confidence", "Percent Correct", names, fp_rates, tp_rates ) out_files.each do |out_file| - case validation_set.unique_feature_type - when "classification" - info = confidence_plot_class_performance( validation_set, actual_accept_value, predicted_accept_value ) - 
RubyPlot::accuracy_confidence_plot(out_file, info[:title], "Confidence", info[:performance], names, confidence, performance) - when "regression" - RubyPlot::accuracy_confidence_plot(out_file, "RMSE vs Confidence Plot", "Confidence", "RMSE", names, confidence, performance, true) - end + info = confidence_plot_class_performance( validation_set, performance_attribute, performance_accept_value ) + RubyPlot::confidence_plot(out_file, info[:title], "Confidence", info[:performance], + names, confidence, performance, CONF_PLOT_RANGE[performance_attribute]) end else - data = transform_confidence_predictions(validation_set, actual_accept_value, predicted_accept_value, show_single_curves) - out_files.each do |out_file| - case validation_set.unique_feature_type - when "classification" - info = confidence_plot_class_performance( validation_set, actual_accept_value, predicted_accept_value ) - RubyPlot::accuracy_confidence_plot(out_file, info[:title], "Confidence", info[:performance], data[:names], data[:confidence], data[:performance]) - when "regression" - RubyPlot::accuracy_confidence_plot(out_file, "RMSE vs Confidence Plot", "Confidence", "RMSE", data[:names], data[:confidence], data[:performance], true) - end + data = transform_confidence_predictions(validation_set, performance_attribute, performance_accept_value, show_single_curves) + out_files.each do |out_file| + info = confidence_plot_class_performance( validation_set, performance_attribute, performance_accept_value ) + RubyPlot::confidence_plot(out_file, info[:title], "Confidence", info[:performance], + data[:names], data[:confidence], data[:performance], CONF_PLOT_RANGE[performance_attribute]) end end end - def self.create_bar_plot( out_files, validation_set, title_attribute, value_attributes ) out_files = [out_files] unless out_files.is_a?(Array) @@ -349,7 +331,11 @@ module Reports end - def self.transform_confidence_predictions(validation_set, actual_accept_value, predicted_accept_value, add_single_folds=false) + + 
def self.transform_confidence_predictions(validation_set, performance_attribute, performance_accept_value, add_single_folds) + + feature_type = validation_set.unique_feature_type + accept_values = validation_set.unique_feature_type=="classification" ? validation_set.get_accept_values : nil if (validation_set.size > 1) @@ -357,34 +343,37 @@ module Reports sum_confidence_values = { :predicted_values => [], :actual_values => [], :confidence_values => []} (0..validation_set.size-1).each do |i| - confidence_values = validation_set.get(i).get_predictions.get_prediction_values(actual_accept_value, predicted_accept_value) + confidence_values = validation_set.get(i).get_predictions.get_prediction_values(performance_attribute, performance_accept_value) sum_confidence_values[:predicted_values] += confidence_values[:predicted_values] sum_confidence_values[:confidence_values] += confidence_values[:confidence_values] sum_confidence_values[:actual_values] += confidence_values[:actual_values] if add_single_folds begin - pref_conf_rates = get_performance_confidence_rates(confidence_values) + perf_conf_rates = get_performance_confidence_rates(confidence_values, performance_attribute, performance_accept_value, + feature_type, accept_values) names << "fold "+i.to_s - performance << pref_conf_rates[:performance] - confidence << pref_conf_rates[:confidence] + performance << perf_conf_rates[:performance] + confidence << perf_conf_rates[:confidence] faint << true rescue LOGGER.warn "could not get confidence vals for fold "+i.to_s end end end - pref_conf_rates = get_performance_confidence_rates(sum_confidence_values, validation_set.unique_feature_type) + perf_conf_rates = get_performance_confidence_rates(sum_confidence_values, performance_attribute, performance_accept_value, + feature_type, accept_values) names << nil # "all" - performance << pref_conf_rates[:performance] - confidence << pref_conf_rates[:confidence] + performance << perf_conf_rates[:performance] + confidence << 
perf_conf_rates[:confidence] faint << false return { :names => names, :performance => performance, :confidence => confidence, :faint => faint } else - confidence_values = validation_set.validations[0].get_predictions.get_prediction_values(actual_accept_value, predicted_accept_value) - pref_conf_rates = get_performance_confidence_rates(confidence_values, validation_set.unique_feature_type) - return { :names => [""], :performance => [pref_conf_rates[:performance]], :confidence => [pref_conf_rates[:confidence]] } + confidence_values = validation_set.validations[0].get_predictions.get_prediction_values(performance_attribute, performance_accept_value) + perf_conf_rates = get_performance_confidence_rates(confidence_values, performance_attribute, performance_accept_value, + feature_type, accept_values) + return { :names => [""], :performance => [perf_conf_rates[:performance]], :confidence => [perf_conf_rates[:confidence]] } end end @@ -408,11 +397,11 @@ module Reports "True Positive Rate", plot_data ) end - def self.get_performance_confidence_rates(roc_values, feature_type) + def self.get_performance_confidence_rates(pred_values, performance_attribute, performance_accept_value, feature_type, accept_values) - c = roc_values[:confidence_values] - p = roc_values[:predicted_values] - a = roc_values[:actual_values] + c = pred_values[:confidence_values] + p = pred_values[:predicted_values] + a = pred_values[:actual_values] raise "no prediction values for confidence plot" if p.size==0 (0..p.size-2).each do |i| @@ -425,40 +414,26 @@ module Reports end end #puts c.inspect+"\n"+a.inspect+"\n"+p.inspect+"\n\n" - perf = [] conf = [] - - case feature_type - when "classification" - count = 0 - correct = 0 - (0..p.size-1).each do |i| - count += 1 - correct += 1 if p[i]==a[i] - if i>0 && (c[i]>=conf[-1]-0.00001) - perf.pop - conf.pop - end - perf << correct/count.to_f * 100 - conf << c[i] + predictions = nil + (0..p.size-1).each do |i| + # melt nearly identical confidence values to get a 
smoother graph + if i>0 && (c[i]>=conf[-1]-0.00001) + perf.pop + conf.pop end - when "regression" - count = 0 - sum_squared_error = 0 - (0..p.size-1).each do |i| - count += 1 - sum_squared_error += (p[i]-a[i])**2 - if i>0 && (c[i]>=conf[-1]-0.00001) - perf.pop - conf.pop - end - perf << Math.sqrt(sum_squared_error/count.to_f) - conf << c[i] + if (predictions == nil) + predictions = Lib::Predictions.new([p[i]],[a[i]],[c[i]],feature_type, accept_values) + else + predictions.update_stats(p[i], a[i], c[i]) end + + val = predictions.send(performance_attribute) + val = val[performance_accept_value] if val.is_a?(Hash) + perf << val + conf << c[i] end - #puts perf.inspect - return {:performance => perf,:confidence => conf} end diff --git a/report/report_content.rb b/report/report_content.rb index 8d6d44b..61db340 100755 --- a/report/report_content.rb +++ b/report/report_content.rb @@ -156,6 +156,7 @@ class Reports::ReportContent section_text += "\nWARNING: regression plot information not available for all validation results" if prediction_set.size!=validation_set.size @xml_report.add_paragraph(section_regr, section_text) if section_text + begin log_str = (log ? 
"_log" : "") plot_png = add_tmp_file("regr_plot"+log_str, "png") @@ -213,8 +214,8 @@ class Reports::ReportContent end def add_confidence_plot( validation_set, - actual_accept_value = nil, - predicted_accept_value = nil, + performance_attribute, + performance_accept_value, split_set_attribute = nil, image_title = "Confidence Plot", section_text="") @@ -234,7 +235,8 @@ class Reports::ReportContent begin plot_png = add_tmp_file("conf_plot", "png") plot_svg = add_tmp_file("conf_plot", "svg") - Reports::PlotFactory.create_confidence_plot( [plot_png[:path], plot_svg[:path]], prediction_set, actual_accept_value, predicted_accept_value, split_set_attribute, false ) + Reports::PlotFactory.create_confidence_plot( [plot_png[:path], plot_svg[:path]], prediction_set, performance_attribute, + performance_accept_value, split_set_attribute, false ) @xml_report.add_imagefigure(section_conf, image_title, plot_png[:name], "PNG", 100, plot_svg[:name]) rescue Exception => ex msg = "WARNING could not create confidence plot: "+ex.message diff --git a/report/report_factory.rb b/report/report_factory.rb index 9995b42..484cf12 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -85,10 +85,10 @@ module Reports::ReportFactory report.align_last_two_images "ROC Plots" end end - report.add_confidence_plot(validation_set) + report.add_confidence_plot(validation_set, :accuracy, nil) validation_set.get_accept_values.each do |accept_value| - report.add_confidence_plot(validation_set, accept_value, nil) - report.add_confidence_plot(validation_set, nil, accept_value) + report.add_confidence_plot(validation_set, :true_positive_rate, accept_value) + report.add_confidence_plot(validation_set, :positive_predictive_value, accept_value) report.align_last_two_images "Confidence Plots" end report.end_section @@ -96,7 +96,9 @@ module Reports::ReportFactory report.add_result(validation_set, [:validation_uri] + VAL_ATTR_TRAIN_TEST + VAL_ATTR_REGR, "Results", "Results") 
report.add_section("Plots") report.add_regression_plot(validation_set, :model_uri) - report.add_confidence_plot(validation_set) + report.add_confidence_plot(validation_set, :root_mean_squared_error, nil) + report.add_confidence_plot(validation_set, :r_square, nil) + report.align_last_two_images "Confidence Plots" report.end_section end task.progress(90) if task @@ -146,10 +148,10 @@ module Reports::ReportFactory report.align_last_two_images "ROC Plots" end end - report.add_confidence_plot(validation_set,nil,nil,split_attribute) + report.add_confidence_plot(validation_set,:accuracy,nil,split_attribute) validation_set.get_accept_values.each do |accept_value| - report.add_confidence_plot(validation_set, accept_value, nil,split_attribute) - report.add_confidence_plot(validation_set, nil, accept_value,split_attribute) + report.add_confidence_plot(validation_set, :true_positive_rate, accept_value, split_attribute) + report.add_confidence_plot(validation_set, :positive_predictive_value, accept_value, split_attribute) report.align_last_two_images "Confidence Plots" end end @@ -160,8 +162,12 @@ module Reports::ReportFactory report.add_result(cv_set, [:crossvalidation_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],res_titel, res_titel, res_text) report.add_section("Plots") report.add_regression_plot(validation_set, :crossvalidation_fold) - report.add_confidence_plot(validation_set) - report.add_confidence_plot(validation_set, nil, :crossvalidation_fold) + report.add_confidence_plot(validation_set, :root_mean_squared_error, nil) + report.add_confidence_plot(validation_set, :r_square, nil) + report.align_last_two_images "Confidence Plots" + report.add_confidence_plot(validation_set, :root_mean_squared_error, nil, :crossvalidation_fold) + report.add_confidence_plot(validation_set, :r_square, nil, :crossvalidation_fold) + report.align_last_two_images "Confidence Plots Across Folds" report.end_section report.add_result(validation_set, [:validation_uri, 
:validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds, :dataset_uri, :algorithm_uri], "Results","Results") end diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 4b6763a..4b2a2d9 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -570,38 +570,6 @@ post '/validate_datasets' do return_task(task) end -get '/:id/verify_r_square' do - - #PENDING: this is debug code, move to test-suite - - validation = Validation::Validation.get(params[:id]) - p = validation.compute_validation_stats_with_model(nil, true) - - puts "actual "+p.actual_values.inspect - puts "predicted "+p.predicted_values.inspect - puts "" - - puts "ot r-square "+p.r_square.to_s - puts "ot sample_correlation_coefficient "+p.sample_correlation_coefficient.to_s - puts "ot sample_correlation_coefficient**2 "+(p.sample_correlation_coefficient**2).to_s - puts "" - - @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r - @@r.assign "v1",p.actual_values - @@r.assign "v2",p.predicted_values - puts "r cor "+@@r.pull("cor(v1,v2)").to_s - # @@r.eval "ttest = t.test(v1,v2,paired=T)" - # t = @@r.pull "ttest$statistic" - @@r.eval "fit <- lm(v1 ~ v2)" - @@r.eval "sum <- summary(fit)" - puts "r r-square "+@@r.pull("sum$r.squared").to_s - puts "r adjusted-r-square "+@@r.pull("sum$adj.r.squared").to_s - - @@r.quit - @@r = nil - -end - get '/:id/predictions' do LOGGER.info "get validation predictions "+params.inspect begin -- cgit v1.2.3 From f00b9f12cf9dbf40eaf3d55881e50acf53776013 Mon Sep 17 00:00:00 2001 From: davor Date: Wed, 30 Nov 2011 12:16:33 +0100 Subject: Updated ruby-plot version to 0.6.0 --- report/environment.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/report/environment.rb b/report/environment.rb index 72320a0..34554f7 100755 --- a/report/environment.rb +++ b/report/environment.rb @@ -1,10 +1,9 @@ - ['rubygems', 'logger', 'fileutils', 'sinatra', 'sinatra/url_for', 'rest_client', 
'yaml', 'fileutils', 'mime/types', 'abbrev', 'rexml/document', 'ruby-plot', 'opentox-ruby' ].each do |g| require g end -gem 'ruby-plot', "~>0.5.0" +gem 'ruby-plot', "~>0.6.0" module Reports end -- cgit v1.2.3 From 7565aeb930c9b24a677b65b89d62cc2db6318cee Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 7 Dec 2011 16:26:41 +0100 Subject: add computation of classification prediciton probabilities --- lib/ot_predictions.rb | 270 +++++++++++++++++++---------------- lib/predictions.rb | 37 ++++- lib/validation_db.rb | 1 + validation/validation_application.rb | 25 ++++ validation/validation_service.rb | 30 +++- 5 files changed, 236 insertions(+), 127 deletions(-) diff --git a/lib/ot_predictions.rb b/lib/ot_predictions.rb index 335fe84..cf0168e 100755 --- a/lib/ot_predictions.rb +++ b/lib/ot_predictions.rb @@ -8,155 +8,175 @@ module Lib CHECK_VALUES = ENV['RACK_ENV'] =~ /debug|test/ def identifier(instance_index) - return compound(instance_index) + compound(instance_index) end def compound(instance_index) - return @compounds[instance_index] + @compounds[instance_index] end - + def initialize( feature_type, test_dataset_uris, test_target_dataset_uris, - prediction_feature, prediction_dataset_uris, predicted_variables, predicted_confidences, subjectid=nil, task=nil) + prediction_feature, prediction_dataset_uris, predicted_variables, predicted_confidences, + subjectid=nil, task=nil ) + + test_dataset_uris = [test_dataset_uris] unless test_dataset_uris.is_a?(Array) + test_target_dataset_uris = [test_target_dataset_uris] unless test_target_dataset_uris.is_a?(Array) + prediction_dataset_uris = [prediction_dataset_uris] unless prediction_dataset_uris.is_a?(Array) + predicted_variables = [predicted_variables] unless predicted_variables.is_a?(Array) + predicted_confidences = [predicted_confidences] unless predicted_confidences.is_a?(Array) + LOGGER.debug "loading prediction -- test-dataset: "+test_dataset_uris.inspect + LOGGER.debug "loading prediction -- test-target-datset: 
"+test_target_dataset_uris.inspect + LOGGER.debug "loading prediction -- prediction-dataset: "+prediction_dataset_uris.inspect + LOGGER.debug "loading prediction -- predicted_variable: "+predicted_variables.inspect + LOGGER.debug "loading prediction -- predicted_confidence: "+predicted_confidences.inspect + LOGGER.debug "loading prediction -- prediction_feature: "+prediction_feature.to_s + raise "prediction_feature missing" unless prediction_feature + + @compounds = [] + all_predicted_values = [] + all_actual_values = [] + all_confidence_values = [] + accept_values = nil - test_dataset_uris = [test_dataset_uris] unless test_dataset_uris.is_a?(Array) - test_target_dataset_uris = [test_target_dataset_uris] unless test_target_dataset_uris.is_a?(Array) - prediction_dataset_uris = [prediction_dataset_uris] unless prediction_dataset_uris.is_a?(Array) - predicted_variables = [predicted_variables] unless predicted_variables.is_a?(Array) - predicted_confidences = [predicted_confidences] unless predicted_confidences.is_a?(Array) - LOGGER.debug "loading prediction -- test-dataset: "+test_dataset_uris.inspect - LOGGER.debug "loading prediction -- test-target-datset: "+test_target_dataset_uris.inspect - LOGGER.debug "loading prediction -- prediction-dataset: "+prediction_dataset_uris.inspect - LOGGER.debug "loading prediction -- predicted_variable: "+predicted_variables.inspect - LOGGER.debug "loading prediction -- predicted_confidence: "+predicted_confidences.inspect - LOGGER.debug "loading prediction -- prediction_feature: "+prediction_feature.to_s - raise "prediction_feature missing" unless prediction_feature + if task + task_step = 100 / (test_dataset_uris.size*2 + 1) + task_status = 0 + end + + test_dataset_uris.size.times do |i| - @compounds = [] - all_predicted_values = [] - all_actual_values = [] - all_confidence_values = [] - accept_values = nil + test_dataset_uri = test_dataset_uris[i] + test_target_dataset_uri = test_target_dataset_uris[i] + prediction_dataset_uri = 
prediction_dataset_uris[i] + predicted_variable = predicted_variables[i] + predicted_confidence = predicted_confidences[i] - if task - task_step = 100 / (test_dataset_uris.size*2 + 1) - task_status = 0 + predicted_variable=prediction_feature if predicted_variable==nil + + test_dataset = Lib::DatasetCache.find test_dataset_uri,subjectid + raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset + + if test_target_dataset_uri == nil || test_target_dataset_uri.strip.size==0 || test_target_dataset_uri==test_dataset_uri + test_target_dataset_uri = test_dataset_uri + test_target_dataset = test_dataset + raise "prediction_feature not found in test_dataset, specify a test_target_dataset\n"+ + "prediction_feature: '"+prediction_feature.to_s+"'\n"+ + "test_dataset: '"+test_target_dataset_uri.to_s+"'\n"+ + "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil + else + test_target_dataset = Lib::DatasetCache.find test_target_dataset_uri,subjectid + raise "test target datset not found: '"+test_target_dataset_uri.to_s+"'" unless test_target_dataset + if CHECK_VALUES + test_dataset.compounds.each do |c| + raise "test compound not found on test class dataset "+c.to_s unless test_target_dataset.compounds.include?(c) + end + end + raise "prediction_feature not found in test_target_dataset\n"+ + "prediction_feature: '"+prediction_feature.to_s+"'\n"+ + "test_target_dataset: '"+test_target_dataset_uri.to_s+"'\n"+ + "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil end - - test_dataset_uris.size.times do |i| - - test_dataset_uri = test_dataset_uris[i] - test_target_dataset_uri = test_target_dataset_uris[i] - prediction_dataset_uri = prediction_dataset_uris[i] - predicted_variable = predicted_variables[i] - predicted_confidence = predicted_confidences[i] - - predicted_variable=prediction_feature if 
predicted_variable==nil - test_dataset = Lib::DatasetCache.find test_dataset_uri,subjectid - raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset + compounds = test_dataset.compounds + LOGGER.debug "test dataset size: "+compounds.size.to_s + raise "test dataset is empty "+test_dataset_uri.to_s unless compounds.size>0 - if test_target_dataset_uri == nil || test_target_dataset_uri.strip.size==0 || test_target_dataset_uri==test_dataset_uri - test_target_dataset_uri = test_dataset_uri - test_target_dataset = test_dataset - raise "prediction_feature not found in test_dataset, specify a test_target_dataset\n"+ - "prediction_feature: '"+prediction_feature.to_s+"'\n"+ - "test_dataset: '"+test_target_dataset_uri.to_s+"'\n"+ - "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil + if feature_type=="classification" + av = test_target_dataset.accept_values(prediction_feature) + raise "'"+OT.acceptValue.to_s+"' missing/invalid for feature '"+prediction_feature.to_s+"' in dataset '"+ + test_target_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av==nil or av.length<2 + if accept_values==nil + accept_values=av else - test_target_dataset = Lib::DatasetCache.find test_target_dataset_uri,subjectid - raise "test target datset not found: '"+test_target_dataset_uri.to_s+"'" unless test_target_dataset - if CHECK_VALUES - test_dataset.compounds.each do |c| - raise "test compound not found on test class dataset "+c.to_s unless test_target_dataset.compounds.include?(c) - end - end - raise "prediction_feature not found in test_target_dataset\n"+ - "prediction_feature: '"+prediction_feature.to_s+"'\n"+ - "test_target_dataset: '"+test_target_dataset_uri.to_s+"'\n"+ - "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil + raise "accept values (in folds) differ "+av.inspect+" != 
"+accept_values.inspect if av!=accept_values end - - compounds = test_dataset.compounds - LOGGER.debug "test dataset size: "+compounds.size.to_s - raise "test dataset is empty "+test_dataset_uri.to_s unless compounds.size>0 - - if feature_type=="classification" - av = test_target_dataset.accept_values(prediction_feature) - raise "'"+OT.acceptValue.to_s+"' missing/invalid for feature '"+prediction_feature.to_s+"' in dataset '"+ - test_target_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av==nil or av.length<2 - if accept_values==nil - accept_values=av - else - raise "accept values (in folds) differ "+av.inspect+" != "+accept_values.inspect if av!=accept_values - end + end + + actual_values = [] + compounds.each do |c| + case feature_type + when "classification" + actual_values << classification_val(test_target_dataset, c, prediction_feature, accept_values) + when "regression" + actual_values << regression_val(test_target_dataset, c, prediction_feature) end - - actual_values = [] - compounds.each do |c| + end + task.progress( task_status += task_step ) if task # loaded actual values + + prediction_dataset = Lib::DatasetCache.find prediction_dataset_uri,subjectid + raise "prediction dataset not found: '"+prediction_dataset_uri.to_s+"'" unless prediction_dataset + + # allow missing prediction feature if there are no compounds in the prediction dataset + raise "predicted_variable not found in prediction_dataset\n"+ + "predicted_variable '"+predicted_variable.to_s+"'\n"+ + "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+ + "available features are: "+prediction_dataset.features.inspect if prediction_dataset.features.keys.index(predicted_variable)==nil and prediction_dataset.compounds.size>0 + raise "predicted_confidence not found in prediction_dataset\n"+ + "predicted_confidence '"+predicted_confidence.to_s+"'\n"+ + "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+ + "available features are: "+prediction_dataset.features.inspect if 
predicted_confidence and prediction_dataset.features.keys.index(predicted_confidence)==nil and prediction_dataset.compounds.size>0 + + raise "more predicted than test compounds, #test: "+compounds.size.to_s+" < #prediction: "+ + prediction_dataset.compounds.size.to_s+", test-dataset: "+test_dataset_uri.to_s+", prediction-dataset: "+ + prediction_dataset_uri if compounds.size < prediction_dataset.compounds.size + if CHECK_VALUES + prediction_dataset.compounds.each do |c| + raise "predicted compound not found in test dataset:\n"+c+"\ntest-compounds:\n"+ + compounds.collect{|c| c.to_s}.join("\n") if compounds.index(c)==nil + end + end + + predicted_values = [] + confidence_values = [] + count = 0 + compounds.each do |c| + if prediction_dataset.compounds.index(c)==nil + predicted_values << nil + confidence_values << nil + else case feature_type when "classification" - actual_values << classification_val(test_target_dataset, c, prediction_feature, accept_values) + predicted_values << classification_val(prediction_dataset, c, predicted_variable, accept_values) when "regression" - actual_values << regression_val(test_target_dataset, c, prediction_feature) + predicted_values << regression_val(prediction_dataset, c, predicted_variable) end - end - task.progress( task_status += task_step ) if task # loaded actual values - - prediction_dataset = Lib::DatasetCache.find prediction_dataset_uri,subjectid - raise "prediction dataset not found: '"+prediction_dataset_uri.to_s+"'" unless prediction_dataset - - # allow missing prediction feature if there are no compounds in the prediction dataset - raise "predicted_variable not found in prediction_dataset\n"+ - "predicted_variable '"+predicted_variable.to_s+"'\n"+ - "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+ - "available features are: "+prediction_dataset.features.inspect if prediction_dataset.features.keys.index(predicted_variable)==nil and prediction_dataset.compounds.size>0 - raise "predicted_confidence not found in 
prediction_dataset\n"+ - "predicted_confidence '"+predicted_confidence.to_s+"'\n"+ - "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+ - "available features are: "+prediction_dataset.features.inspect if predicted_confidence and prediction_dataset.features.keys.index(predicted_confidence)==nil and prediction_dataset.compounds.size>0 - - raise "more predicted than test compounds, #test: "+compounds.size.to_s+" < #prediction: "+ - prediction_dataset.compounds.size.to_s+", test-dataset: "+test_dataset_uri.to_s+", prediction-dataset: "+ - prediction_dataset_uri if compounds.size < prediction_dataset.compounds.size - if CHECK_VALUES - prediction_dataset.compounds.each do |c| - raise "predicted compound not found in test dataset:\n"+c+"\ntest-compounds:\n"+ - compounds.collect{|c| c.to_s}.join("\n") if compounds.index(c)==nil - end - end - - predicted_values = [] - confidence_values = [] - count = 0 - compounds.each do |c| - if prediction_dataset.compounds.index(c)==nil - predicted_values << nil - confidence_values << nil + if predicted_confidence + confidence_values << confidence_val(prediction_dataset, c, predicted_confidence) else - case feature_type - when "classification" - predicted_values << classification_val(prediction_dataset, c, predicted_variable, accept_values) - when "regression" - predicted_values << regression_val(prediction_dataset, c, predicted_variable) - end - if predicted_confidence - confidence_values << confidence_val(prediction_dataset, c, predicted_confidence) - else - confidence_values << nil - end + confidence_values << nil end - count += 1 end - @compounds += compounds - all_predicted_values += predicted_values - all_actual_values += actual_values - all_confidence_values += confidence_values - - task.progress( task_status += task_step ) if task # loaded predicted values and confidence + count += 1 end + @compounds += compounds + all_predicted_values += predicted_values + all_actual_values += actual_values + all_confidence_values += 
confidence_values + task.progress( task_status += task_step ) if task # loaded predicted values and confidence + end + + #sort according to confidence if available + if all_confidence_values.compact.size>0 + values = [] + all_predicted_values.size.times do |i| + values << [all_predicted_values[i], all_actual_values[i], all_confidence_values[i], @compounds[i]] + end + values = values.sort_by{ |v| v[2] || 0 }.reverse # sorting by confidence + all_predicted_values = [] + all_actual_values = [] + all_confidence_values = [] + @compounds = [] + values.each do |v| + all_predicted_values << v[0] + all_actual_values << v[1] + all_confidence_values << v[2] + @compounds << v[3] + end + end + super(all_predicted_values, all_actual_values, all_confidence_values, feature_type, accept_values) - raise "illegal num compounds "+num_info if @compounds.size != @predicted_values.size + raise "illegal num compounds "+num_info if @compounds.size != @predicted_values.size task.progress(100) if task # done with the mathmatics end diff --git a/lib/predictions.rb b/lib/predictions.rb index 56bdd22..bd32efb 100755 --- a/lib/predictions.rb +++ b/lib/predictions.rb @@ -18,7 +18,29 @@ module Lib def identifier(instance_index) return instance_index.to_s end - + + def data + { :predicted_values => @predicted_values, :actual_values => @actual_values, :confidence_values => @confidence_values, + :feature_type => @feature_type, :accept_values => @accept_values } + end + + def self.from_data( data, min_confidence=nil, prediction_index=nil ) + if min_confidence!=nil + valid_indices = [] + data[:confidence_values].size.times do |i| + valid_indices << i if prediction_index==data[:predicted_values][i] and + (valid_indices.size<=12 or data[:confidence_values][i]>=min_confidence) + end + [ :predicted_values, :actual_values, :confidence_values ].each do |key| + arr = [] + valid_indices.each{|i| arr << data[key][i]} + data[key] = arr + end + end + Predictions.new( data[:predicted_values], data[:actual_values], 
data[:confidence_values], + data[:feature_type], data[:accept_values] ) + end + def initialize( predicted_values, actual_values, confidence_values, @@ -280,6 +302,15 @@ module Lib return res end + # returns acutal values for a certain prediction + def confusion_matrix_row(predicted_class_index) + r = [] + (0..@num_classes-1).each do |actual| + r << @confusion_matrix[actual][predicted_class_index] + end + return r + end + def area_under_roc(class_index=nil) return prediction_feature_value_map( lambda{ |i| area_under_roc(i) } ) if class_index==nil @@ -742,6 +773,10 @@ module Lib @conf_provided end + def min_confidence + @confidence_values[-1] + end + ################################################################################################################### #def compound(instance_index) diff --git a/lib/validation_db.rb b/lib/validation_db.rb index be004fb..f770dc2 100755 --- a/lib/validation_db.rb +++ b/lib/validation_db.rb @@ -72,6 +72,7 @@ module Validation attribute :classification_statistics_yaml attribute :regression_statistics_yaml attribute :finished + attribute :prediction_data index :model_uri index :validation_type diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 4b2a2d9..0647b10 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -190,6 +190,15 @@ get '/crossvalidation/:id/statistics' do end end +get '/crossvalidation/:id/statistics/probabilities' do + + LOGGER.info "get crossvalidation statistics for crossvalidation with id "+params[:id].to_s + v = Validation::Validation.from_cv_statistics( params[:id], @subjectid ) + props = v.probabilities(params[:confidence].to_s.to_f,params[:prediction].to_s) + content_type "text/x-yaml" + props.to_yaml +end + delete '/crossvalidation/:id/?' 
do LOGGER.info "delete crossvalidation with id "+params[:id].to_s content_type "text/plain" @@ -570,6 +579,22 @@ post '/validate_datasets' do return_task(task) end +get '/:id/probabilities' do + LOGGER.info "get validation probabilities "+params.inspect + + begin + validation = Validation::Validation.get(params[:id]) + rescue ActiveRecord::RecordNotFound => ex + raise OpenTox::NotFoundError.new "Validation '#{params[:id]}' not found." + end + validation.subjectid = @subjectid + raise OpenTox::BadRequestError.new "Validation '"+params[:id].to_s+"' not finished" unless validation.finished + props = validation.probabilities(params[:confidence].to_s.to_f,params[:prediction].to_s) + content_type "text/x-yaml" + props.to_yaml +end + + get '/:id/predictions' do LOGGER.info "get validation predictions "+params.inspect begin diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 8dc90e2..2b8a18f 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -72,6 +72,7 @@ module Validation v.crossvalidation_id = crossvalidation.id v.crossvalidation_fold = vals.collect{ |vv| vv.crossvalidation_fold }.uniq.join(";") v.real_runtime = vals.collect{ |vv| vv.real_runtime }.uniq.join(";") + v.prediction_data = prediction.data.to_yaml v.save end waiting_task.progress(100) if waiting_task @@ -236,7 +237,8 @@ module Validation LOGGER.debug "computing prediction stats" prediction = Lib::OTPredictions.new( feature_type, self.test_dataset_uri, self.test_target_dataset_uri, self.prediction_feature, - self.prediction_dataset_uri, predicted_variable, predicted_confidence, self.subjectid, OpenTox::SubTask.create(task, 0, 80) ) + self.prediction_dataset_uri, predicted_variable, predicted_confidence, self.subjectid, + OpenTox::SubTask.create(task, 0, 80) ) #reading datasets and computing the main stats is 80% the work unless dry_run @@ -261,6 +263,7 @@ module Validation :percent_without_class => prediction.percent_without_class, 
:num_unpredicted => prediction.num_unpredicted, :percent_unpredicted => prediction.percent_unpredicted, + :prediction_data => prediction.data.to_yaml, :finished => true raise unless self.valid? end @@ -268,6 +271,31 @@ module Validation task.progress(100) if task prediction end + + + def probabilities( confidence, prediction ) + raise OpenTox::BadRequestError.new "Only supported for classification" if classification_statistics==nil + raise OpenTox::BadRequestError.new("illegal confidence value #{confidence}") if !confidence.is_a?(Numeric) or confidence<0 or confidence>1 + + p_data = YAML.load(self.prediction_data.to_s) + raise OpenTox::BadRequestError.new("probabilities method works only for new validations - prediction data missing") unless p_data + raise OpenTox::BadRequestError.new("illegal prediction value: '"+prediction+"', available: "+ + p_data[:accept_values].inspect) if p_data[:accept_values].index(prediction)==nil + + p = Lib::Predictions.from_data(p_data, confidence, p_data[:accept_values].index(prediction)) + raise OpenTox::BadRequestError("no confidence values available") unless p.confidence_values_available? 
+ + prediction_counts = p.confusion_matrix_row( p_data[:accept_values].index(prediction) ) + sum = 0 + prediction_counts.each{|v| sum+=v} + + probs = {} + p_data[:accept_values].size.times do |i| + probs[p_data[:accept_values][i]] = prediction_counts[i]/sum.to_f + end + probs + {:probs => probs, :num_predictions => sum, :min_confidence => p.min_confidence} + end end class Crossvalidation -- cgit v1.2.3 From d02b54b2c58d2d71e29700bbedbb38768d6c9e35 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 13 Dec 2011 11:20:04 +0100 Subject: add filtering of validation reports --- lib/ot_predictions.rb | 238 ++--------------------------- lib/prediction_data.rb | 287 +++++++++++++++++++++++++++++++++++ lib/predictions.rb | 49 ++---- lib/validation_db.rb | 10 +- report/plot_factory.rb | 132 ++++++++++++++-- report/report_content.rb | 6 + report/report_factory.rb | 34 +++-- report/report_service.rb | 10 +- report/statistical_test.rb | 22 +-- report/validation_access.rb | 25 ++- report/validation_data.rb | 26 ++-- test/test_examples_util.rb | 3 +- validation/validation_application.rb | 104 ++++++------- validation/validation_service.rb | 168 ++++++++++---------- 14 files changed, 658 insertions(+), 456 deletions(-) create mode 100644 lib/prediction_data.rb diff --git a/lib/ot_predictions.rb b/lib/ot_predictions.rb index cf0168e..3be845b 100755 --- a/lib/ot_predictions.rb +++ b/lib/ot_predictions.rb @@ -1,12 +1,17 @@ +require "lib/prediction_data.rb" require "lib/predictions.rb" module Lib class OTPredictions < Predictions - CHECK_VALUES = ENV['RACK_ENV'] =~ /debug|test/ - + def initialize(data, compounds=nil) + raise unless data.is_a?(Hash) + super(data) + @compounds = compounds + end + def identifier(instance_index) compound(instance_index) end @@ -15,234 +20,9 @@ module Lib @compounds[instance_index] end - def initialize( feature_type, test_dataset_uris, test_target_dataset_uris, - prediction_feature, prediction_dataset_uris, predicted_variables, predicted_confidences, - 
subjectid=nil, task=nil ) - - test_dataset_uris = [test_dataset_uris] unless test_dataset_uris.is_a?(Array) - test_target_dataset_uris = [test_target_dataset_uris] unless test_target_dataset_uris.is_a?(Array) - prediction_dataset_uris = [prediction_dataset_uris] unless prediction_dataset_uris.is_a?(Array) - predicted_variables = [predicted_variables] unless predicted_variables.is_a?(Array) - predicted_confidences = [predicted_confidences] unless predicted_confidences.is_a?(Array) - LOGGER.debug "loading prediction -- test-dataset: "+test_dataset_uris.inspect - LOGGER.debug "loading prediction -- test-target-datset: "+test_target_dataset_uris.inspect - LOGGER.debug "loading prediction -- prediction-dataset: "+prediction_dataset_uris.inspect - LOGGER.debug "loading prediction -- predicted_variable: "+predicted_variables.inspect - LOGGER.debug "loading prediction -- predicted_confidence: "+predicted_confidences.inspect - LOGGER.debug "loading prediction -- prediction_feature: "+prediction_feature.to_s - raise "prediction_feature missing" unless prediction_feature - - @compounds = [] - all_predicted_values = [] - all_actual_values = [] - all_confidence_values = [] - accept_values = nil - - if task - task_step = 100 / (test_dataset_uris.size*2 + 1) - task_status = 0 - end - - test_dataset_uris.size.times do |i| - - test_dataset_uri = test_dataset_uris[i] - test_target_dataset_uri = test_target_dataset_uris[i] - prediction_dataset_uri = prediction_dataset_uris[i] - predicted_variable = predicted_variables[i] - predicted_confidence = predicted_confidences[i] - - predicted_variable=prediction_feature if predicted_variable==nil - - test_dataset = Lib::DatasetCache.find test_dataset_uri,subjectid - raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset - - if test_target_dataset_uri == nil || test_target_dataset_uri.strip.size==0 || test_target_dataset_uri==test_dataset_uri - test_target_dataset_uri = test_dataset_uri - test_target_dataset = 
test_dataset - raise "prediction_feature not found in test_dataset, specify a test_target_dataset\n"+ - "prediction_feature: '"+prediction_feature.to_s+"'\n"+ - "test_dataset: '"+test_target_dataset_uri.to_s+"'\n"+ - "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil - else - test_target_dataset = Lib::DatasetCache.find test_target_dataset_uri,subjectid - raise "test target datset not found: '"+test_target_dataset_uri.to_s+"'" unless test_target_dataset - if CHECK_VALUES - test_dataset.compounds.each do |c| - raise "test compound not found on test class dataset "+c.to_s unless test_target_dataset.compounds.include?(c) - end - end - raise "prediction_feature not found in test_target_dataset\n"+ - "prediction_feature: '"+prediction_feature.to_s+"'\n"+ - "test_target_dataset: '"+test_target_dataset_uri.to_s+"'\n"+ - "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil - end - - compounds = test_dataset.compounds - LOGGER.debug "test dataset size: "+compounds.size.to_s - raise "test dataset is empty "+test_dataset_uri.to_s unless compounds.size>0 - - if feature_type=="classification" - av = test_target_dataset.accept_values(prediction_feature) - raise "'"+OT.acceptValue.to_s+"' missing/invalid for feature '"+prediction_feature.to_s+"' in dataset '"+ - test_target_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av==nil or av.length<2 - if accept_values==nil - accept_values=av - else - raise "accept values (in folds) differ "+av.inspect+" != "+accept_values.inspect if av!=accept_values - end - end - - actual_values = [] - compounds.each do |c| - case feature_type - when "classification" - actual_values << classification_val(test_target_dataset, c, prediction_feature, accept_values) - when "regression" - actual_values << regression_val(test_target_dataset, c, prediction_feature) - end - end - 
task.progress( task_status += task_step ) if task # loaded actual values - - prediction_dataset = Lib::DatasetCache.find prediction_dataset_uri,subjectid - raise "prediction dataset not found: '"+prediction_dataset_uri.to_s+"'" unless prediction_dataset - - # allow missing prediction feature if there are no compounds in the prediction dataset - raise "predicted_variable not found in prediction_dataset\n"+ - "predicted_variable '"+predicted_variable.to_s+"'\n"+ - "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+ - "available features are: "+prediction_dataset.features.inspect if prediction_dataset.features.keys.index(predicted_variable)==nil and prediction_dataset.compounds.size>0 - raise "predicted_confidence not found in prediction_dataset\n"+ - "predicted_confidence '"+predicted_confidence.to_s+"'\n"+ - "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+ - "available features are: "+prediction_dataset.features.inspect if predicted_confidence and prediction_dataset.features.keys.index(predicted_confidence)==nil and prediction_dataset.compounds.size>0 - - raise "more predicted than test compounds, #test: "+compounds.size.to_s+" < #prediction: "+ - prediction_dataset.compounds.size.to_s+", test-dataset: "+test_dataset_uri.to_s+", prediction-dataset: "+ - prediction_dataset_uri if compounds.size < prediction_dataset.compounds.size - if CHECK_VALUES - prediction_dataset.compounds.each do |c| - raise "predicted compound not found in test dataset:\n"+c+"\ntest-compounds:\n"+ - compounds.collect{|c| c.to_s}.join("\n") if compounds.index(c)==nil - end - end - - predicted_values = [] - confidence_values = [] - count = 0 - compounds.each do |c| - if prediction_dataset.compounds.index(c)==nil - predicted_values << nil - confidence_values << nil - else - case feature_type - when "classification" - predicted_values << classification_val(prediction_dataset, c, predicted_variable, accept_values) - when "regression" - predicted_values << 
regression_val(prediction_dataset, c, predicted_variable) - end - if predicted_confidence - confidence_values << confidence_val(prediction_dataset, c, predicted_confidence) - else - confidence_values << nil - end - end - count += 1 - end - @compounds += compounds - all_predicted_values += predicted_values - all_actual_values += actual_values - all_confidence_values += confidence_values - - task.progress( task_status += task_step ) if task # loaded predicted values and confidence - end - - #sort according to confidence if available - if all_confidence_values.compact.size>0 - values = [] - all_predicted_values.size.times do |i| - values << [all_predicted_values[i], all_actual_values[i], all_confidence_values[i], @compounds[i]] - end - values = values.sort_by{ |v| v[2] || 0 }.reverse # sorting by confidence - all_predicted_values = [] - all_actual_values = [] - all_confidence_values = [] - @compounds = [] - values.each do |v| - all_predicted_values << v[0] - all_actual_values << v[1] - all_confidence_values << v[2] - @compounds << v[3] - end - end - - super(all_predicted_values, all_actual_values, all_confidence_values, feature_type, accept_values) - raise "illegal num compounds "+num_info if @compounds.size != @predicted_values.size - task.progress(100) if task # done with the mathmatics - end - - private - def regression_val(dataset, compound, feature) - v = value(dataset, compound, feature) - begin - v = v.to_f unless v==nil or v.is_a?(Numeric) - v - rescue - LOGGER.warn "no numeric value for regression: '"+v.to_s+"'" - nil - end - end - - def confidence_val(dataset, compound, confidence) - v = value(dataset, compound, confidence) - begin - v = v.to_f unless v==nil or v.is_a?(Numeric) - v - rescue - LOGGER.warn "no numeric value for confidence '"+v.to_s+"'" - nil - end - end - - def classification_val(dataset, compound, feature, accept_values) - v = value(dataset, compound, feature) - i = accept_values.index(v.to_s) - raise "illegal class_value of prediction (value 
is '"+v.to_s+"'), accept values are "+ - accept_values.inspect unless v==nil or i!=nil - i - end - - def value(dataset, compound, feature) - return nil if dataset.data_entries[compound]==nil - if feature==nil - v = dataset.data_entries[compound].values[0] - else - v = dataset.data_entries[compound][feature] - end - return nil if v==nil - raise "no array "+v.class.to_s+" : '"+v.to_s+"'" unless v.is_a?(Array) - if v.size>1 - v.uniq! - if v.size>1 - v = nil - LOGGER.warn "not yet implemented: multiple non-equal values "+compound.to_s+" "+v.inspect - else - v = v[0] - end - elsif v.size==1 - v = v[0] - else - v = nil - end - raise "array" if v.is_a?(Array) - v = nil if v.to_s.size==0 - v - end - - public - def compute_stats - + def compute_stats() res = {} - case @feature_type + case feature_type when "classification" (Validation::VAL_CLASS_PROPS).each{ |s| res[s] = send(s)} when "regression" diff --git a/lib/prediction_data.rb b/lib/prediction_data.rb new file mode 100644 index 0000000..154d11a --- /dev/null +++ b/lib/prediction_data.rb @@ -0,0 +1,287 @@ + +module Lib + + class PredictionData + + CHECK_VALUES = ENV['RACK_ENV'] =~ /debug|test/ + + def self.filter_data( data, compounds, min_confidence, min_num_predictions, max_num_predictions, prediction_index=nil ) + + raise OpenTox::BadRequestError.new "please specify either min_confidence or max_num_predictions" if + (min_confidence!=nil and max_num_predictions!=nil) || (min_confidence==nil and max_num_predictions==nil) + raise OpenTox::BadRequestError.new "min_num_predictions only valid for min_confidence" if + (min_confidence==nil and min_num_predictions!=nil) + min_num_predictions = 0 if min_num_predictions==nil + + LOGGER.debug("filtering predictions, conf:'"+min_confidence.to_s+"' min_num_predictions: '"+ + min_num_predictions.to_s+"' max_num_predictions: '"+max_num_predictions.to_s+"' ") + + orig_size = data[:predicted_values].size + valid_indices = [] + data[:confidence_values].size.times do |i| + next if 
prediction_index!=nil and prediction_index!=data[:predicted_values][i] + valid = false + if min_confidence!=nil + valid = (valid_indices.size<=min_num_predictions or data[:confidence_values][i]>=min_confidence) + else + valid = valid_indices.size0 + + if feature_type=="classification" + av = test_target_dataset.accept_values(prediction_feature) + raise "'"+OT.acceptValue.to_s+"' missing/invalid for feature '"+prediction_feature.to_s+"' in dataset '"+ + test_target_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av==nil or av.length<2 + if accept_values==nil + accept_values=av + else + raise "accept values (in folds) differ "+av.inspect+" != "+accept_values.inspect if av!=accept_values + end + end + + actual_values = [] + compounds.each do |c| + case feature_type + when "classification" + actual_values << classification_val(test_target_dataset, c, prediction_feature, accept_values) + when "regression" + actual_values << regression_val(test_target_dataset, c, prediction_feature) + end + end + task.progress( task_status += task_step ) if task # loaded actual values + + prediction_dataset = Lib::DatasetCache.find prediction_dataset_uri,subjectid + raise "prediction dataset not found: '"+prediction_dataset_uri.to_s+"'" unless prediction_dataset + + # allow missing prediction feature if there are no compounds in the prediction dataset + raise "predicted_variable not found in prediction_dataset\n"+ + "predicted_variable '"+predicted_variable.to_s+"'\n"+ + "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+ + "available features are: "+prediction_dataset.features.inspect if prediction_dataset.features.keys.index(predicted_variable)==nil and prediction_dataset.compounds.size>0 + raise "predicted_confidence not found in prediction_dataset\n"+ + "predicted_confidence '"+predicted_confidence.to_s+"'\n"+ + "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+ + "available features are: "+prediction_dataset.features.inspect if predicted_confidence and 
prediction_dataset.features.keys.index(predicted_confidence)==nil and prediction_dataset.compounds.size>0 + + raise "more predicted than test compounds, #test: "+compounds.size.to_s+" < #prediction: "+ + prediction_dataset.compounds.size.to_s+", test-dataset: "+test_dataset_uri.to_s+", prediction-dataset: "+ + prediction_dataset_uri if compounds.size < prediction_dataset.compounds.size + if CHECK_VALUES + prediction_dataset.compounds.each do |c| + raise "predicted compound not found in test dataset:\n"+c+"\ntest-compounds:\n"+ + compounds.collect{|c| c.to_s}.join("\n") if compounds.index(c)==nil + end + end + + predicted_values = [] + confidence_values = [] + count = 0 + compounds.each do |c| + if prediction_dataset.compounds.index(c)==nil + predicted_values << nil + confidence_values << nil + else + case feature_type + when "classification" + predicted_values << classification_val(prediction_dataset, c, predicted_variable, accept_values) + when "regression" + predicted_values << regression_val(prediction_dataset, c, predicted_variable) + end + if predicted_confidence + confidence_values << confidence_val(prediction_dataset, c, predicted_confidence) + else + confidence_values << nil + end + end + count += 1 + end + all_compounds += compounds + all_predicted_values += predicted_values + all_actual_values += actual_values + all_confidence_values += confidence_values + + task.progress( task_status += task_step ) if task # loaded predicted values and confidence + end + + #sort according to confidence if available + if all_confidence_values.compact.size>0 + values = [] + all_predicted_values.size.times do |i| + values << [all_predicted_values[i], all_actual_values[i], all_confidence_values[i], all_compounds[i]] + end + values = values.sort_by{ |v| v[2] || 0 }.reverse # sorting by confidence + all_predicted_values = [] + all_actual_values = [] + all_confidence_values = [] + all_compounds = [] + values.each do |v| + all_predicted_values << v[0] + all_actual_values << v[1] 
+ all_confidence_values << v[2] + all_compounds << v[3] + end + end + + raise "illegal num compounds "+all_compounds.size.to_s+" != "+all_predicted_values.size.to_s if + all_compounds.size != all_predicted_values.size + task.progress(100) if task # done with the mathmatics + data = { :predicted_values => all_predicted_values, :actual_values => all_actual_values, :confidence_values => all_confidence_values, + :feature_type => feature_type, :accept_values => accept_values } + + PredictionData.new(data, all_compounds) + end + + private + def initialize( data, compounds ) + @data = data + @compounds = compounds + end + + private + def self.regression_val(dataset, compound, feature) + v = value(dataset, compound, feature) + begin + v = v.to_f unless v==nil or v.is_a?(Numeric) + v + rescue + LOGGER.warn "no numeric value for regression: '"+v.to_s+"'" + nil + end + end + + def self.confidence_val(dataset, compound, confidence) + v = value(dataset, compound, confidence) + begin + v = v.to_f unless v==nil or v.is_a?(Numeric) + v + rescue + LOGGER.warn "no numeric value for confidence '"+v.to_s+"'" + nil + end + end + + def self.classification_val(dataset, compound, feature, accept_values) + v = value(dataset, compound, feature) + i = accept_values.index(v.to_s) + raise "illegal class_value of prediction (value is '"+v.to_s+"'), accept values are "+ + accept_values.inspect unless v==nil or i!=nil + i + end + + def self.value(dataset, compound, feature) + return nil if dataset.data_entries[compound]==nil + if feature==nil + v = dataset.data_entries[compound].values[0] + else + v = dataset.data_entries[compound][feature] + end + return nil if v==nil + raise "no array "+v.class.to_s+" : '"+v.to_s+"'" unless v.is_a?(Array) + if v.size>1 + v.uniq! 
+ if v.size>1 + v = nil + LOGGER.warn "not yet implemented: multiple non-equal values "+compound.to_s+" "+v.inspect + else + v = v[0] + end + elsif v.size==1 + v = v[0] + else + v = nil + end + raise "array" if v.is_a?(Array) + v = nil if v.to_s.size==0 + v + end + end +end \ No newline at end of file diff --git a/lib/predictions.rb b/lib/predictions.rb index bd32efb..233267d 100755 --- a/lib/predictions.rb +++ b/lib/predictions.rb @@ -1,4 +1,6 @@ +require "lib/prediction_data.rb" + module Lib module Util @@ -19,36 +21,11 @@ module Lib return instance_index.to_s end - def data - { :predicted_values => @predicted_values, :actual_values => @actual_values, :confidence_values => @confidence_values, - :feature_type => @feature_type, :accept_values => @accept_values } - end - - def self.from_data( data, min_confidence=nil, prediction_index=nil ) - if min_confidence!=nil - valid_indices = [] - data[:confidence_values].size.times do |i| - valid_indices << i if prediction_index==data[:predicted_values][i] and - (valid_indices.size<=12 or data[:confidence_values][i]>=min_confidence) - end - [ :predicted_values, :actual_values, :confidence_values ].each do |key| - arr = [] - valid_indices.each{|i| arr << data[key][i]} - data[key] = arr - end - end - Predictions.new( data[:predicted_values], data[:actual_values], data[:confidence_values], - data[:feature_type], data[:accept_values] ) - end - - def initialize( predicted_values, - actual_values, - confidence_values, - feature_type, - accept_values=nil ) + def initialize( data ) + raise unless data.is_a?(Hash) - @feature_type = feature_type - @accept_values = accept_values + @feature_type = data[:feature_type] + @accept_values = data[:accept_values] @num_classes = 1 #puts "predicted: "+predicted_values.inspect @@ -57,11 +34,11 @@ module Lib raise "unknown feature_type: '"+@feature_type.to_s+"'" unless @feature_type=="classification" || @feature_type=="regression" - raise "no predictions" if predicted_values.size == 0 - num_info = 
"predicted:"+predicted_values.size.to_s+ - " confidence:"+confidence_values.size.to_s+" actual:"+actual_values.size.to_s - raise "illegal num actual values "+num_info if actual_values.size != predicted_values.size - raise "illegal num confidence values "+num_info if confidence_values.size != predicted_values.size + raise "no predictions" if data[:predicted_values].size == 0 + num_info = "predicted:"+data[:predicted_values].size.to_s+ + " confidence:"+data[:confidence_values].size.to_s+" actual:"+data[:actual_values].size.to_s + raise "illegal num actual values "+num_info if data[:actual_values].size != data[:predicted_values].size + raise "illegal num confidence values "+num_info if data[:confidence_values].size != data[:predicted_values].size case @feature_type when "classification" @@ -76,8 +53,8 @@ module Lib @actual_values = [] @confidence_values = [] init_stats() - (0..predicted_values.size-1).each do |i| - update_stats( predicted_values[i], actual_values[i], confidence_values[i] ) + (0..data[:predicted_values].size-1).each do |i| + update_stats( data[:predicted_values][i], data[:actual_values][i], data[:confidence_values][i] ) end end diff --git a/lib/validation_db.rb b/lib/validation_db.rb index f770dc2..c3a3f71 100755 --- a/lib/validation_db.rb +++ b/lib/validation_db.rb @@ -72,7 +72,7 @@ module Validation attribute :classification_statistics_yaml attribute :regression_statistics_yaml attribute :finished - attribute :prediction_data + attribute :prediction_data_yaml index :model_uri index :validation_type @@ -100,6 +100,14 @@ module Validation def regression_statistics=(rs) self.regression_statistics_yaml = rs.to_yaml end + + def prediction_data + YAML.load(self.prediction_data_yaml) if self.prediction_data_yaml + end + + def prediction_data=(pd) + self.prediction_data_yaml = pd.to_yaml + end def save super diff --git a/report/plot_factory.rb b/report/plot_factory.rb index 6083d26..2d7946f 100644 --- a/report/plot_factory.rb +++ b/report/plot_factory.rb @@ 
-338,7 +338,6 @@ module Reports accept_values = validation_set.unique_feature_type=="classification" ? validation_set.get_accept_values : nil if (validation_set.size > 1) - names = []; performance = []; confidence = []; faint = [] sum_confidence_values = { :predicted_values => [], :actual_values => [], :confidence_values => []} @@ -378,19 +377,107 @@ module Reports end def self.demo_roc_plot -# roc_values = {:confidence_values => [0.1, 0.9, 0.5, 0.6, 0.6, 0.6], -# :predicted_values => [1, 0, 0, 1, 0, 1], -# :actual_values => [0, 1, 0, 0, 1, 1]} - roc_values = {:confidence_values => [0.9, 0.8, 0.7, 0.6, 0.5, 0.4], - :true_positives => [1, 1, 1, 0, 1, 0]} - tp_fp_rates = get_tp_fp_rates(roc_values) - labels = [] - tp_fp_rates[:youden].each do |point,confidence| - labels << ["confidence: "+confidence.to_s, point[0], point[1]] - end - + + seed = 831 #rand(1000) + puts seed + srand seed + plot_data = [] - plot_data << RubyPlot::LinePlotData.new(:name => "testname", :x_values => tp_fp_rates[:fp_rate], :y_values => tp_fp_rates[:tp_rate], :labels => labels) + n = 250 + a_cutoff = 0.5 + + a_real = [] + a_class = [] + n.times do |i| + a_real << rand + a_class << ( a_real[-1]>a_cutoff ? "a" : "b") + end + + puts a_real.to_csv + puts a_class.to_csv + + p_props = [[],[]] + p_classes = [] + + 2.times do |index| + + if (index==0) + p_noise = 0.15 + p_cutoff = 0.8 + else + p_noise = 0.5 + p_cutoff = 0.5 + end + + p_real = [] + p_class = [] + p_prop = [] + correct = [] + n.times do |i| + if rand<0.04 + p_real << rand + else + p_real << (a_real[i] + ((rand * p_noise) * (rand<0.5 ? 1 : -1))) + end + p_prop << ((p_cutoff-p_real[i]).abs) + p_class << ( p_real[-1]>p_cutoff ? "a" : "b") + correct << ((p_class[i]==a_class[i]) ? 
1 : 0) + end + + puts "" + puts p_real.to_csv + puts p_class.to_csv + puts p_prop.to_csv + + p_prop_max = p_prop.max + p_prop_min = p_prop.min + p_prop_delta = p_prop_max - p_prop_min + n.times do |i| + p_prop[i] = (p_prop[i] - p_prop_min)/p_prop_delta.to_f + p_props[index][i] = p_prop[i] + end + + puts p_prop.to_csv + + p_classes << p_class + + (0..n-2).each do |i| + (i+1..n-1).each do |j| + if p_prop[i] p_prop, + :true_positives => correct} + tp_fp_rates = get_tp_fp_rates(roc_values) + labels = [] + tp_fp_rates[:youden].each do |point,confidence| + labels << ["confidence: "+confidence.to_s, point[0], point[1]] + end + + plot_data << RubyPlot::LinePlotData.new(:name => "alg"+index.to_s, + :x_values => tp_fp_rates[:fp_rate], + :y_values => tp_fp_rates[:tp_rate]) + #,:labels => labels) + end + + puts "instance,class,prediction_1,propability_1,prediction_2,propability_2" + n.times do |i| + puts (i+1).to_s+","+a_class[i].to_s+","+p_classes[0][i].to_s+ + ","+p_props[0][i].to_s+ + ","+p_classes[1][i].to_s+","+p_props[1][i].to_s + end RubyPlot::plot_lines("/tmp/plot.png", "ROC-Plot", "False positive rate", @@ -424,7 +511,9 @@ module Reports conf.pop end if (predictions == nil) - predictions = Lib::Predictions.new([p[i]],[a[i]],[c[i]],feature_type, accept_values) + data = {:predicted_values => [p[i]],:actual_values => [a[i]], :confidence_values => [c[i]], + :feature_type => feature_type, :accept_values => accept_values} + predictions = Lib::Predictions.new(data) else predictions.update_stats(p[i], a[i], c[i]) end @@ -528,7 +617,20 @@ end #require "rubygems" #require "ruby-plot" -##Reports::PlotFactory::demo_ranking_plot +###Reports::PlotFactory::demo_ranking_plot +#class Array +# def sum +# inject( nil ) { |sum,x| sum ? sum+x : x } +# end +# +# def to_csv +# s = "" +# each do |x| +# s += (x.is_a?(Float) ? 
("%.3f"%x) : (" "+x.to_s) )+", " +# end +# s +# end +#end #Reports::PlotFactory::demo_roc_plot #a = [1, 0, 1, 2, 3, 0, 2] diff --git a/report/report_content.rb b/report/report_content.rb index 61db340..3d92b52 100755 --- a/report/report_content.rb +++ b/report/report_content.rb @@ -22,6 +22,12 @@ class Reports::ReportContent @current_section = @xml_report.get_root_element end + def add_warning(warning) + sec = @xml_report.add_section(@current_section, "Warning") + @xml_report.add_paragraph(sec, warning) + end_section() + end + def add_paired_ttest_tables( validation_set, group_attribute, test_attributes, diff --git a/report/report_factory.rb b/report/report_factory.rb index 484cf12..2b978c5 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -63,14 +63,26 @@ module Reports::ReportFactory end end - def self.create_report_validation(validation_set, task=nil) + def self.add_filter_warning(report, filter_params) + msg = "The validation results for this report have been filtered." 
+ msg += " Minimum confidence: "+ filter_params[:min_confidence].to_s if + filter_params[:min_confidence]!=nil + msg += " Minimum number of predictions (sorted with confidence): "+ filter_params[:min_num_predictions].to_s if + filter_params[:min_num_predictions]!=nil + msg += " Maximum number of predictions: "+ filter_params[:max_num_predictions].to_s if + filter_params[:max_num_predictions]!=nil + report.add_warning(msg) + end + + def self.create_report_validation(validation_set, params, task=nil) raise OpenTox::BadRequestError.new("num validations is not equal to 1") unless validation_set.size==1 val = validation_set.validations[0] pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,80) ) report = Reports::ReportContent.new("Validation report") - + add_filter_warning(report, validation_set.filter_params) if validation_set.filter_params!=nil + case val.feature_type when "classification" report.add_result(validation_set, [:validation_uri] + VAL_ATTR_TRAIN_TEST + VAL_ATTR_CLASS, "Results", "Results") @@ -109,8 +121,9 @@ module Reports::ReportFactory report end - def self.create_report_crossvalidation(validation_set, task=nil) + def self.create_report_crossvalidation(validation_set, params, task=nil) + raise OpenTox::BadRequestError.new "cv report not implemented for filter params" if validation_set.filter_params!=nil raise OpenTox::BadRequestError.new("num validations is not >1") unless validation_set.size>1 raise OpenTox::BadRequestError.new("crossvalidation-id not unique and != nil: "+ validation_set.get_values(:crossvalidation_id,false).inspect) if validation_set.unique_value(:crossvalidation_id)==nil @@ -119,7 +132,7 @@ module Reports::ReportFactory validation_set.unique_value(:num_folds).to_s+")") unless validation_set.unique_value(:num_folds).to_i==validation_set.size raise OpenTox::BadRequestError.new("num different folds is not equal to num validations") unless validation_set.num_different_values(:crossvalidation_fold)==validation_set.size 
raise OpenTox::BadRequestError.new("validations must have unique feature type, i.e. must be either all regression, "+ - "or all classification validations") unless validation_set.unique_feature_type + "or all classification validations") unless validation_set.unique_feature_type pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,80) ) validation_set.validations.sort! do |x,y| x.crossvalidation_fold.to_f <=> y.crossvalidation_fold.to_f @@ -138,13 +151,12 @@ module Reports::ReportFactory report.add_confusion_matrix(cv_set.validations[0]) report.add_section("Plots") [nil, :crossvalidation_fold].each do |split_attribute| - if (validation_set.get_accept_values.size == 2) if validation_set.get_true_accept_value!=nil report.add_roc_plot(validation_set, validation_set.get_true_accept_value,split_attribute) else - report.add_roc_plot(validation_set, validation_set.get_accept_values[0],split_attribute) - report.add_roc_plot(validation_set, validation_set.get_accept_values[1],split_attribute) + report.add_roc_plot(validation_set, validation_set.get_accept_values[0], split_attribute) + report.add_roc_plot(validation_set, validation_set.get_accept_values[1], split_attribute) report.align_last_two_images "ROC Plots" end end @@ -156,7 +168,8 @@ module Reports::ReportFactory end end report.end_section - report.add_result(validation_set, [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_CLASS-[:num_folds, :dataset_uri, :algorithm_uri], + report.add_result(validation_set, + [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_CLASS-[:num_folds, :dataset_uri, :algorithm_uri], "Results","Results") when "regression" report.add_result(cv_set, [:crossvalidation_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],res_titel, res_titel, res_text) @@ -169,7 +182,9 @@ module Reports::ReportFactory report.add_confidence_plot(validation_set, :r_square, nil, :crossvalidation_fold) report.align_last_two_images "Confidence Plots Across Folds" 
report.end_section - report.add_result(validation_set, [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds, :dataset_uri, :algorithm_uri], "Results","Results") + report.add_result(validation_set, + [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds, :dataset_uri, :algorithm_uri], + "Results","Results") end task.progress(90) if task @@ -219,6 +234,7 @@ module Reports::ReportFactory pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,80) ) report = Reports::ReportContent.new("Algorithm comparison report") + add_filter_warning(report, validation_set.filter_params) if validation_set.filter_params!=nil if (validation_set.num_different_values(:dataset_uri)>1) all_merged = validation_set.merge([:algorithm_uri, :dataset_uri, :crossvalidation_id, :crossvalidation_uri]) diff --git a/report/report_service.rb b/report/report_service.rb index f299122..53a17ab 100644 --- a/report/report_service.rb +++ b/report/report_service.rb @@ -72,7 +72,15 @@ module Reports LOGGER.debug "identifier: '"+identifier.inspect+"'" raise "illegal num identifiers: "+identifier.size.to_s+" should be equal to num validation-uris ("+validation_uris.size.to_s+")" if identifier and identifier.size!=validation_uris.size - validation_set = Reports::ValidationSet.new(validation_uris, identifier, subjectid) + + filter_params = nil + [:min_confidence, :min_num_predictions, :max_num_predictions].each do |key| + if params[key] != nil + filter_params = {} unless filter_params + filter_params[key] = params[key].to_f + end + end + validation_set = Reports::ValidationSet.new(validation_uris, identifier, filter_params, subjectid) raise OpenTox::BadRequestError.new("cannot get validations from validation_uris '"+validation_uris.inspect+"'") unless validation_set and validation_set.size > 0 LOGGER.debug "loaded "+validation_set.size.to_s+" validation/s" task.progress(10) if task diff --git a/report/statistical_test.rb 
b/report/statistical_test.rb index 8d6bd62..4d85555 100644 --- a/report/statistical_test.rb +++ b/report/statistical_test.rb @@ -69,8 +69,8 @@ module Reports def self.paired_ttest( validations1, validations2, attribute, class_value, significance_level=0.95 ) - array1 = validations1.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value] : v.send(attribute)) } - array2 = validations2.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value] : v.send(attribute)) } + array1 = validations1.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value].to_f : v.send(attribute).to_f) } + array2 = validations2.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value].to_f : v.send(attribute).to_f) } LOGGER.debug "paired-t-testing "+attribute.to_s+" "+array1.inspect+" vs "+array2.inspect LIB::StatisticalTest.pairedTTest(array1, array2, significance_level) end @@ -83,12 +83,16 @@ module Reports end -#t1 = Time.new -#10.times do -# puts LIB::StatisticalTest.pairedTTest([1,2,3,4,5,12,4,2],[2,3,3,3,56,3,4,5]) -#end -#LIB::StatisticalTest.quitR -#t2 = Time.new -#puts t2-t1 +#x=["1.36840891838074", "2.89500403404236", "2.58440494537354", "1.96544003486633", "1.4017288684845", "1.68250012397766", "1.65089893341064", "2.24862003326416", "3.73909902572632", "2.36335206031799"] +#y=["1.9675121307373", "2.30981087684631", "2.59359288215637", "2.62243509292603", "1.98700189590454", "2.26789593696594", "2.03917217254639", "2.69466996192932", "1.96487307548523", "1.65820598602295"] +#puts LIB::StatisticalTest.pairedTTest(x,y) +# +##t1 = Time.new +##10.times do +# puts LIB::StatisticalTest.pairedTTest([1.01,2,3,4,5,12,4,2],[2,3,3,3,56,3,4,5]) +##end +#LIB::StatisticalTest.quit_r +##t2 = Time.new +##puts t2-t1 diff --git a/report/validation_access.rb b/report/validation_access.rb index 3b5335c..536923d 100755 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -13,7 +13,7 @@ class 
Reports::ValidationDB self_uri.host == val_uri.host && self_uri.port == val_uri.port end - def resolve_cv_uris(validation_uris, identifier=nil, subjectid=nil) + def resolve_cv_uris(validation_uris, identifier, subjectid) res = {} count = 0 validation_uris.each do |u| @@ -47,8 +47,8 @@ class Reports::ValidationDB res end - def init_validation(validation, uri, subjectid=nil) - + def init_validation(validation, uri, filter_params, subjectid) + raise OpenTox::BadRequestError.new "not a validation uri: "+uri.to_s unless uri =~ /\/[0-9]+$/ validation_id = uri.split("/")[-1] raise OpenTox::BadRequestError.new "invalid validation id "+validation_id.to_s unless validation_id!=nil and @@ -63,6 +63,9 @@ class Reports::ValidationDB else v = YAML::load(OpenTox::RestClientWrapper.get uri, {:subjectid=>subjectid, :accept=>"application/serialize"}) end + v.filter_predictions(filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions]) if + filter_params + raise OpenTox::NotFoundError.new "validation with id "+validation_id.to_s+" not found" unless v raise OpenTox::BadRequestError.new "validation with id "+validation_id.to_s+" is not finished yet" unless v.finished (Validation::VAL_PROPS + Validation::VAL_CV_PROPS).each do |p| @@ -80,7 +83,7 @@ class Reports::ValidationDB end end - def init_validation_from_cv_statistics( validation, cv_uri, subjectid=nil ) + def init_validation_from_cv_statistics( validation, cv_uri, filter_params, subjectid ) raise OpenTox::BadRequestError.new "not a crossvalidation uri: "+cv_uri.to_s unless cv_uri.uri? 
and cv_uri =~ /crossvalidation.*\/[0-9]+$/ @@ -96,6 +99,9 @@ class Reports::ValidationDB cv = YAML::load(OpenTox::RestClientWrapper.get cv_uri, {:subjectid=>subjectid, :accept=>"application/serialize"}) v = YAML::load(OpenTox::RestClientWrapper.get cv_uri+"/statistics", {:subjectid=>subjectid, :accept=>"application/serialize"}) end + v.filter_predictions(filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions]) if + filter_params + (Validation::VAL_PROPS + Validation::VAL_CV_PROPS).each do |p| validation.send("#{p.to_s}=".to_sym, v.send(p)) end @@ -126,11 +132,14 @@ class Reports::ValidationDB end end - def get_predictions(validation, subjectid=nil, task=nil) - - Lib::OTPredictions.new( validation.feature_type, validation.test_dataset_uri, + def get_predictions(validation, filter_params, subjectid, task) + # we need compound info, cannot reuse stored prediction data + data = Lib::PredictionData.create( validation.feature_type, validation.test_dataset_uri, validation.test_target_dataset_uri, validation.prediction_feature, validation.prediction_dataset_uri, - validation.predicted_variable, validation.predicted_confidence, subjectid, task) + validation.predicted_variable, validation.predicted_confidence, subjectid, task ) + data = Lib::PredictionData.filter_data( data.data, data.compounds, + filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions] ) if filter_params!=nil + Lib::OTPredictions.new( data.data, data.compounds ) end def get_accept_values( validation, subjectid=nil ) diff --git a/report/validation_data.rb b/report/validation_data.rb index 61761ab..e91348d 100755 --- a/report/validation_data.rb +++ b/report/validation_data.rb @@ -86,18 +86,20 @@ module Reports VAL_ATTR_RANKING.collect{ |a| (a.to_s+"_ranking").to_sym } @@validation_attributes.each{ |a| attr_accessor a } - attr_reader :predictions, :subjectid + attr_reader :predictions, :subjectid, :filter_params 
attr_accessor :identifier, :validation_report_uri, :crossvalidation_report_uri - def initialize(uri = nil, subjectid = nil) - Reports.validation_access.init_validation(self, uri, subjectid) if uri + def initialize(uri = nil, filter_params=nil, subjectid = nil) + Reports.validation_access.init_validation(self, uri, filter_params, subjectid) if uri @subjectid = subjectid + raise unless filter_params==nil || filter_params.is_a?(Hash) + @filter_params = filter_params #raise "subjectid is nil" unless subjectid end - def self.from_cv_statistics( cv_uri, subjectid = nil ) - v = ReportValidation.new(nil, subjectid) - Reports.validation_access.init_validation_from_cv_statistics(v, cv_uri, subjectid) + def self.from_cv_statistics( cv_uri, filter_params, subjectid ) + v = ReportValidation.new(nil, filter_params, subjectid) + Reports.validation_access.init_validation_from_cv_statistics(v, cv_uri, filter_params, subjectid) v end @@ -116,7 +118,7 @@ module Reports task.progress(100) if task nil else - @predictions = Reports.validation_access.get_predictions( self, @subjectid, task ) + @predictions = Reports.validation_access.get_predictions( self, @filter_params, @subjectid, task ) end end end @@ -167,13 +169,13 @@ module Reports # class ValidationSet - def initialize(validation_uris=nil, identifier=nil, subjectid=nil) + def initialize(validation_uris=nil, identifier=nil, filter_params=nil, subjectid=nil) @unique_values = {} @validations = [] if validation_uris validation_uri_and_ids = ReportValidation.resolve_cv_uris(validation_uris, identifier, subjectid) validation_uri_and_ids.each do |u,id| - v = ReportValidation.new(u, subjectid) + v = ReportValidation.new(u, filter_params, subjectid) v.identifier = id if id ids = Reports.persistance.list_reports("validation",{:validation_uris=>v.validation_uri }) v.validation_report_uri = ReportService.instance.get_uri("validation",ids[-1]) if ids and ids.size>0 @@ -228,6 +230,10 @@ module Reports return false end + def filter_params + 
@validations.first.filter_params + end + # loads the attributes of the related crossvalidation into all validation objects # def load_cv_attributes @@ -424,7 +430,7 @@ module Reports new_set = ValidationSet.new grouping = Util.group(@validations, [:crossvalidation_id]) grouping.each do |g| - v = ReportValidation.from_cv_statistics(g[0].crossvalidation_uri, g[0].subjectid) + v = ReportValidation.from_cv_statistics(g[0].crossvalidation_uri, @validations.first.filter_params, g[0].subjectid) v.identifier = g.collect{|vv| vv.identifier}.uniq.join(";") new_set.validations << v end diff --git a/test/test_examples_util.rb b/test/test_examples_util.rb index a5f2867..b48096d 100755 --- a/test/test_examples_util.rb +++ b/test/test_examples_util.rb @@ -299,7 +299,8 @@ module ValidationExamples def report( waiting_task=nil ) #begin - @report_uri = Util.validation_post '/report/'+report_type,{:validation_uris => @validation_uri},@subjectid,waiting_task if @validation_uri + @report_uri = Util.validation_post '/report/'+report_type,{:validation_uris => @validation_uri}, + @subjectid,waiting_task if @validation_uri Util.validation_get "/report/"+report_uri.split("/")[-2]+"/"+report_uri.split("/")[-1], @subjectid if @report_uri #rescue => ex #puts "could not create report: "+ex.message diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 0647b10..f126679 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -226,34 +226,34 @@ end # Validation::Validation.find( :all, :conditions => { :crossvalidation_id => params[:id] } ).collect{ |v| v.validation_uri.to_s }.join("\n")+"\n" #end -get '/crossvalidation/:id/predictions' do - LOGGER.info "get predictions for crossvalidation with id "+params[:id].to_s - begin - #crossvalidation = Validation::Crossvalidation.find(params[:id]) - crossvalidation = Validation::Crossvalidation.get(params[:id]) - rescue ActiveRecord::RecordNotFound => ex - raise 
OpenTox::NotFoundError.new "Crossvalidation '#{params[:id]}' not found." - end - raise OpenTox::BadRequestError.new "Crossvalidation '"+params[:id].to_s+"' not finished" unless crossvalidation.finished - - content_type "application/x-yaml" - validations = Validation::Validation.find( :crossvalidation_id => params[:id], :validation_type => "crossvalidation" ) - p = Lib::OTPredictions.to_array( validations.collect{ |v| v.compute_validation_stats_with_model(nil, true) } ).to_yaml - - case request.env['HTTP_ACCEPT'].to_s - when /text\/html/ - content_type "text/html" - description = - "The crossvalidation predictions as (yaml-)array." - related_links = - "All crossvalidations: "+url_for("/crossvalidation",:full)+"\n"+ - "Correspoding crossvalidation: "+url_for("/crossvalidation/"+params[:id],:full) - OpenTox.text_to_html p,@subjectid, related_links, description - else - content_type "text/x-yaml" - p - end -end +#get '/crossvalidation/:id/predictions' do +# LOGGER.info "get predictions for crossvalidation with id "+params[:id].to_s +# begin +# #crossvalidation = Validation::Crossvalidation.find(params[:id]) +# crossvalidation = Validation::Crossvalidation.get(params[:id]) +# rescue ActiveRecord::RecordNotFound => ex +# raise OpenTox::NotFoundError.new "Crossvalidation '#{params[:id]}' not found." +# end +# raise OpenTox::BadRequestError.new "Crossvalidation '"+params[:id].to_s+"' not finished" unless crossvalidation.finished +# +# content_type "application/x-yaml" +# validations = Validation::Validation.find( :crossvalidation_id => params[:id], :validation_type => "crossvalidation" ) +# p = Lib::OTPredictions.to_array( validations.collect{ |v| v.compute_validation_stats_with_model(nil, true) } ).to_yaml +# +# case request.env['HTTP_ACCEPT'].to_s +# when /text\/html/ +# content_type "text/html" +# description = +# "The crossvalidation predictions as (yaml-)array." 
+# related_links = +# "All crossvalidations: "+url_for("/crossvalidation",:full)+"\n"+ +# "Correspoding crossvalidation: "+url_for("/crossvalidation/"+params[:id],:full) +# OpenTox.text_to_html p,@subjectid, related_links, description +# else +# content_type "text/x-yaml" +# p +# end +#end get '/?' do @@ -595,30 +595,30 @@ get '/:id/probabilities' do end -get '/:id/predictions' do - LOGGER.info "get validation predictions "+params.inspect - begin - #validation = Validation::Validation.find(params[:id]) - validation = Validation::Validation.get(params[:id]) - rescue ActiveRecord::RecordNotFound => ex - raise OpenTox::NotFoundError.new "Validation '#{params[:id]}' not found." - end - raise OpenTox::BadRequestError.new "Validation '"+params[:id].to_s+"' not finished" unless validation.finished - p = validation.compute_validation_stats_with_model(nil, true) - case request.env['HTTP_ACCEPT'].to_s - when /text\/html/ - content_type "text/html" - description = - "The validation predictions as (yaml-)array." - related_links = - "All validations: "+url_for("/",:full)+"\n"+ - "Correspoding validation: "+url_for("/"+params[:id],:full) - OpenTox.text_to_html p.to_array.to_yaml,@subjectid, related_links, description - else - content_type "text/x-yaml" - p.to_array.to_yaml - end -end +#get '/:id/predictions' do +# LOGGER.info "get validation predictions "+params.inspect +# begin +# #validation = Validation::Validation.find(params[:id]) +# validation = Validation::Validation.get(params[:id]) +# rescue ActiveRecord::RecordNotFound => ex +# raise OpenTox::NotFoundError.new "Validation '#{params[:id]}' not found." +# end +# raise OpenTox::BadRequestError.new "Validation '"+params[:id].to_s+"' not finished" unless validation.finished +# p = validation.compute_validation_stats_with_model(nil, true) +# case request.env['HTTP_ACCEPT'].to_s +# when /text\/html/ +# content_type "text/html" +# description = +# "The validation predictions as (yaml-)array." 
+# related_links = +# "All validations: "+url_for("/",:full)+"\n"+ +# "Correspoding validation: "+url_for("/"+params[:id],:full) +# OpenTox.text_to_html p.to_array.to_yaml,@subjectid, related_links, description +# else +# content_type "text/x-yaml" +# p.to_array.to_yaml +# end +#end #get '/:id/:attribute' do # LOGGER.info "access validation attribute "+params.inspect diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 2b8a18f..7f853ca 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -38,32 +38,12 @@ module Validation crossvalidation = Crossvalidation.get(cv_id) raise OpenTox::NotFoundError.new "Crossvalidation '#{cv_id}' not found." unless crossvalidation raise OpenTox::BadRequestError.new "Crossvalidation '"+cv_id.to_s+"' not finished" unless crossvalidation.finished - vals = Validation.find( :crossvalidation_id => cv_id, :validation_type => "crossvalidation" ).collect{|x| x} - models = vals.collect{|v| OpenTox::Model::Generic.find(v.model_uri, subjectid)} - feature_type = models.first.feature_type(subjectid) - test_dataset_uris = vals.collect{|v| v.test_dataset_uri} - test_target_dataset_uris = vals.collect{|v| v.test_target_dataset_uri} - prediction_feature = vals.first.prediction_feature - prediction_dataset_uris = vals.collect{|v| v.prediction_dataset_uri} - predicted_variables = models.collect{|m| m.predicted_variable(subjectid)} - predicted_confidences = models.collect{|m| m.predicted_confidence(subjectid)} - prediction = Lib::OTPredictions.new( feature_type, test_dataset_uris, test_target_dataset_uris, prediction_feature, - prediction_dataset_uris, predicted_variables, predicted_confidences, subjectid, OpenTox::SubTask.create(waiting_task, 0, 90) ) - + v = Validation.new - case feature_type - when "classification" - v.classification_statistics = prediction.compute_stats - when "regression" - v.regression_statistics = prediction.compute_stats - end - v.update :num_instances => 
prediction.num_instances, - :num_without_class => prediction.num_without_class, - :percent_without_class => prediction.percent_without_class, - :num_unpredicted => prediction.num_unpredicted, - :percent_unpredicted => prediction.percent_unpredicted, - :finished => true + v.compute_prediction_data_with_cv(vals, waiting_task) + v.compute_validation_stats() + (VAL_PROPS_GENERAL-[:validation_uri]).each do |p| v.send("#{p.to_s}=".to_sym, vals.collect{ |vv| vv.send(p) }.uniq.join(";")) end @@ -72,7 +52,6 @@ module Validation v.crossvalidation_id = crossvalidation.id v.crossvalidation_fold = vals.collect{ |vv| vv.crossvalidation_fold }.uniq.join(";") v.real_runtime = vals.collect{ |vv| vv.real_runtime }.uniq.join(";") - v.prediction_data = prediction.data.to_yaml v.save end waiting_task.progress(100) if waiting_task @@ -200,13 +179,26 @@ module Validation self.prediction_dataset_uri = prediction_dataset_uri self.real_runtime = benchmark.real - compute_validation_stats_with_model( model, false, OpenTox::SubTask.create(task, 50, 100) ) + compute_prediction_data_with_model( model, OpenTox::SubTask.create(task, 50, 100) ) + compute_validation_stats() end - - def compute_validation_stats_with_model( model=nil, dry_run=false, task=nil ) - - #model = OpenTox::Model::PredictionModel.find(self.model_uri) if model==nil and self.model_uri - #raise OpenTox::NotFoundError.new "model not found: "+self.model_uri.to_s unless model + + def compute_prediction_data_with_cv(cv_vals, waiting_task=nil) + models = cv_vals.collect{|v| OpenTox::Model::Generic.find(v.model_uri, subjectid)} + feature_type = models.first.feature_type(subjectid) + test_dataset_uris = cv_vals.collect{|v| v.test_dataset_uri} + test_target_dataset_uris = cv_vals.collect{|v| v.test_target_dataset_uri} + prediction_feature = cv_vals.first.prediction_feature + prediction_dataset_uris = cv_vals.collect{|v| v.prediction_dataset_uri} + predicted_variables = models.collect{|m| m.predicted_variable(subjectid)} + 
predicted_confidences = models.collect{|m| m.predicted_confidence(subjectid)} + p_data = Lib::PredictionData.create( feature_type, test_dataset_uris, test_target_dataset_uris, prediction_feature, + prediction_dataset_uris, predicted_variables, predicted_confidences, subjectid, waiting_task ) + self.prediction_data = p_data.data + p_data.data + end + + def compute_prediction_data_with_model(model=nil, task=nil) model = OpenTox::Model::Generic.find(self.model_uri, self.subjectid) if model==nil and self.model_uri raise OpenTox::NotFoundError.new "model not found: "+self.model_uri.to_s unless model @@ -219,76 +211,82 @@ module Validation raise "cannot determine whether model '"+model.uri.to_s+"' performs classification or regression, "+ "please set rdf-type of predictedVariables feature '"+predicted_variable.to_s+ "' to NominalFeature or NumericFeature" if (feature_type.to_s!="classification" and feature_type.to_s!="regression") - compute_validation_stats( feature_type, predicted_variable, predicted_confidence, - prediction_feature, algorithm_uri, dry_run, task ) + compute_prediction_data( feature_type, predicted_variable, predicted_confidence, + prediction_feature, algorithm_uri, task ) end - - def compute_validation_stats( feature_type, predicted_variable, predicted_confidence, prediction_feature, - algorithm_uri, dry_run, task ) - -# self.attributes = { :prediction_feature => prediction_feature } if self.prediction_feature==nil && prediction_feature -# self.attributes = { :algorithm_uri => algorithm_uri } if self.algorithm_uri==nil && algorithm_uri -# self.save! 
-# self.update :prediction_feature => prediction_feature if self.prediction_feature==nil && prediction_feature -# self.update :algorithm_uri => algorithm_uri if self.algorithm_uri==nil && algorithm_uri + + def compute_prediction_data( feature_type, predicted_variable, predicted_confidence, prediction_feature, + algorithm_uri, task ) self.prediction_feature = prediction_feature if self.prediction_feature==nil && prediction_feature self.algorithm_uri = algorithm_uri if self.algorithm_uri==nil && algorithm_uri - + LOGGER.debug "computing prediction stats" - prediction = Lib::OTPredictions.new( feature_type, + p_data = Lib::PredictionData.create( feature_type, self.test_dataset_uri, self.test_target_dataset_uri, self.prediction_feature, self.prediction_dataset_uri, predicted_variable, predicted_confidence, self.subjectid, OpenTox::SubTask.create(task, 0, 80) ) - #reading datasets and computing the main stats is 80% the work - - unless dry_run - case feature_type - when "classification" - #self.attributes = { :classification_statistics => prediction.compute_stats } - #self.update :classification_statistics => prediction.compute_stats - self.classification_statistics = prediction.compute_stats - when "regression" - #self.attributes = { :regression_statistics => prediction.compute_stats } - self.regression_statistics = prediction.compute_stats - end -# self.attributes = { :num_instances => prediction.num_instances, -# :num_without_class => prediction.num_without_class, -# :percent_without_class => prediction.percent_without_class, -# :num_unpredicted => prediction.num_unpredicted, -# :percent_unpredicted => prediction.percent_unpredicted, -# :finished => true} -# self.save! 
- self.update :num_instances => prediction.num_instances, - :num_without_class => prediction.num_without_class, - :percent_without_class => prediction.percent_without_class, - :num_unpredicted => prediction.num_unpredicted, - :percent_unpredicted => prediction.percent_unpredicted, - :prediction_data => prediction.data.to_yaml, - :finished => true - raise unless self.valid? - end - + self.prediction_data = p_data.data task.progress(100) if task - prediction + p_data.data end + def compute_validation_stats( save_stats=true ) + p_data = self.prediction_data + raise "compute prediction data before" if p_data==nil + predictions = Lib::OTPredictions.new(p_data) + case p_data[:feature_type] + when "classification" + self.classification_statistics = predictions.compute_stats() + when "regression" + self.regression_statistics = predictions.compute_stats() + end + self.num_instances = predictions.num_instances + self.num_without_class = predictions.num_without_class + self.percent_without_class = predictions.percent_without_class + self.num_unpredicted = predictions.num_unpredicted + self.percent_unpredicted = predictions.percent_unpredicted + if (save_stats) + self.finished = true + self.save + raise unless self.valid? 
+ end + end - def probabilities( confidence, prediction ) - raise OpenTox::BadRequestError.new "Only supported for classification" if classification_statistics==nil - raise OpenTox::BadRequestError.new("illegal confidence value #{confidence}") if !confidence.is_a?(Numeric) or confidence<0 or confidence>1 + def filter_predictions( min_confidence, min_num_predictions, max_num_predictions, prediction=nil ) + self.prediction_data = nil + self.save - p_data = YAML.load(self.prediction_data.to_s) - raise OpenTox::BadRequestError.new("probabilities method works only for new validations - prediction data missing") unless p_data + raise OpenTox::BadRequestError.new "only supported for classification" if prediction!=nil and classification_statistics==nil + raise OpenTox::BadRequestError.new "illegal confidence value #{min_confidence}" unless + min_confidence==nil or (min_confidence.is_a?(Numeric) and min_confidence>=0 and min_confidence<=1) + p_data = self.prediction_data + if p_data==nil + # this is to ensure backwards compatibilty + # may cause a timeout on the first run, as this is not meant to run in a task + if validation_type=="crossvalidation_statistics" + vals = Validation.find( :crossvalidation_id => self.crossvalidation_id, :validation_type => "crossvalidation" ).collect{|x| x} + compute_prediction_data_with_cv(vals) + else + compute_prediction_data_with_model + end + self.save + p_data = self.prediction_data + end raise OpenTox::BadRequestError.new("illegal prediction value: '"+prediction+"', available: "+ - p_data[:accept_values].inspect) if p_data[:accept_values].index(prediction)==nil - - p = Lib::Predictions.from_data(p_data, confidence, p_data[:accept_values].index(prediction)) - raise OpenTox::BadRequestError("no confidence values available") unless p.confidence_values_available? 
- + p_data[:accept_values].inspect) if prediction!=nil and p_data[:accept_values].index(prediction)==nil + p = Lib::PredictionData.filter_data(p_data, nil, min_confidence, min_num_predictions, max_num_predictions, + prediction==nil ? nil : p_data[:accept_values].index(prediction)) + self.prediction_data = p.data + compute_validation_stats(false) + end + + def probabilities( confidence, prediction ) + filter_predictions( confidence, 12, nil, prediction ) + p_data = self.prediction_data + p = Lib::Predictions.new(p_data) prediction_counts = p.confusion_matrix_row( p_data[:accept_values].index(prediction) ) sum = 0 prediction_counts.each{|v| sum+=v} - probs = {} p_data[:accept_values].size.times do |i| probs[p_data[:accept_values][i]] = prediction_counts[i]/sum.to_f -- cgit v1.2.3 From 98c22a0f22c5e4934a72dfeabbcb1fbad5a5838f Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 14 Dec 2011 16:40:01 +0100 Subject: make prediciton filtering robust to nil values --- lib/prediction_data.rb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/prediction_data.rb b/lib/prediction_data.rb index 154d11a..434818d 100644 --- a/lib/prediction_data.rb +++ b/lib/prediction_data.rb @@ -7,6 +7,7 @@ module Lib def self.filter_data( data, compounds, min_confidence, min_num_predictions, max_num_predictions, prediction_index=nil ) + raise "cannot filter anything, no confidence values available" if data[:confidence_values][0]==nil raise OpenTox::BadRequestError.new "please specify either min_confidence or max_num_predictions" if (min_confidence!=nil and max_num_predictions!=nil) || (min_confidence==nil and max_num_predictions==nil) raise OpenTox::BadRequestError.new "min_num_predictions only valid for min_confidence" if @@ -15,14 +16,16 @@ module Lib LOGGER.debug("filtering predictions, conf:'"+min_confidence.to_s+"' min_num_predictions: '"+ min_num_predictions.to_s+"' max_num_predictions: '"+max_num_predictions.to_s+"' ") - + #LOGGER.debug("to filter:\nconf: 
"+data[:confidence_values].inspect) + orig_size = data[:predicted_values].size valid_indices = [] data[:confidence_values].size.times do |i| next if prediction_index!=nil and prediction_index!=data[:predicted_values][i] valid = false if min_confidence!=nil - valid = (valid_indices.size<=min_num_predictions or data[:confidence_values][i]>=min_confidence) + valid = (valid_indices.size<=min_num_predictions or + (data[:confidence_values][i]!=nil and data[:confidence_values][i]>=min_confidence)) else valid = valid_indices.size Date: Tue, 3 Jan 2012 13:38:58 +0100 Subject: add missing subjectid when loading validation for reports --- report/validation_access.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/report/validation_access.rb b/report/validation_access.rb index 536923d..2cb0c6f 100755 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -63,6 +63,7 @@ class Reports::ValidationDB else v = YAML::load(OpenTox::RestClientWrapper.get uri, {:subjectid=>subjectid, :accept=>"application/serialize"}) end + v.subjectid = subjectid v.filter_predictions(filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions]) if filter_params -- cgit v1.2.3 From 4a4b743704b819dec39c672d9550038f45484160 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 4 Jan 2012 12:23:33 +0100 Subject: fix validation aa errors, add more verbose error msg for propbabilites feature --- validation/validation_application.rb | 6 ++++-- validation/validation_service.rb | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/validation/validation_application.rb b/validation/validation_application.rb index f126679..c02b5f3 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -193,6 +193,7 @@ end get '/crossvalidation/:id/statistics/probabilities' do LOGGER.info "get crossvalidation statistics for crossvalidation with id "+params[:id].to_s + raise OpenTox::BadRequestError.new("Missing 
params, plz give confidence and prediction") unless params[:confidence] and params[:prediction] v = Validation::Validation.from_cv_statistics( params[:id], @subjectid ) props = v.probabilities(params[:confidence].to_s.to_f,params[:prediction].to_s) content_type "text/x-yaml" @@ -585,10 +586,11 @@ get '/:id/probabilities' do begin validation = Validation::Validation.get(params[:id]) rescue ActiveRecord::RecordNotFound => ex - raise OpenTox::NotFoundError.new "Validation '#{params[:id]}' not found." + raise OpenTox::NotFoundError.new("Validation '#{params[:id]}' not found.") end validation.subjectid = @subjectid - raise OpenTox::BadRequestError.new "Validation '"+params[:id].to_s+"' not finished" unless validation.finished + raise OpenTox::BadRequestError.new("Validation '"+params[:id].to_s+"' not finished") unless validation.finished + raise OpenTox::BadRequestError.new("Missing params, plz give confidence and prediction") unless params[:confidence] and params[:prediction] props = validation.probabilities(params[:confidence].to_s.to_f,params[:prediction].to_s) content_type "text/x-yaml" props.to_yaml diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 7f853ca..889c652 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -41,6 +41,7 @@ module Validation vals = Validation.find( :crossvalidation_id => cv_id, :validation_type => "crossvalidation" ).collect{|x| x} v = Validation.new + v.subjectid = subjectid v.compute_prediction_data_with_cv(vals, waiting_task) v.compute_validation_stats() @@ -54,6 +55,7 @@ module Validation v.real_runtime = vals.collect{ |vv| vv.real_runtime }.uniq.join(";") v.save end + v.subjectid = subjectid waiting_task.progress(100) if waiting_task v end -- cgit v1.2.3 From 26e7338a1f5b8aac257de20e70c48e8144ef0720 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Thu, 5 Jan 2012 14:23:01 +0100 Subject: fix: make sure subjectid is set in merged validation for comparison report --- 
report/validation_data.rb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/report/validation_data.rb b/report/validation_data.rb index e91348d..eea5229 100755 --- a/report/validation_data.rb +++ b/report/validation_data.rb @@ -86,8 +86,8 @@ module Reports VAL_ATTR_RANKING.collect{ |a| (a.to_s+"_ranking").to_sym } @@validation_attributes.each{ |a| attr_accessor a } - attr_reader :predictions, :subjectid, :filter_params - attr_accessor :identifier, :validation_report_uri, :crossvalidation_report_uri + attr_reader :predictions, :filter_params + attr_accessor :identifier, :validation_report_uri, :crossvalidation_report_uri, :subjectid def initialize(uri = nil, filter_params=nil, subjectid = nil) Reports.validation_access.init_validation(self, uri, filter_params, subjectid) if uri @@ -456,7 +456,8 @@ module Reports #merge Lib::MergeObjects.register_merge_attributes( ReportValidation, - Validation::VAL_MERGE_AVG+Validation::VAL_MERGE_SUM,[],Validation::VAL_MERGE_GENERAL+[:identifier, :validation_report_uri, :crossvalidation_report_uri]) unless + Validation::VAL_MERGE_AVG+Validation::VAL_MERGE_SUM,[], + Validation::VAL_MERGE_GENERAL+[:identifier, :validation_report_uri, :crossvalidation_report_uri, :subjectid]) unless Lib::MergeObjects.merge_attributes_registered?(ReportValidation) grouping.each do |g| new_set.validations << g[0].clone_validation -- cgit v1.2.3 From 7cf18f317d50d8ce3f7e002e3b4743910a4e656a Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 10 Jan 2012 10:45:59 +0100 Subject: change in validation: duplicates (with different values) in test-set are handled as one prediction each --- lib/prediction_data.rb | 84 +++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/lib/prediction_data.rb b/lib/prediction_data.rb index 434818d..42da5fc 100644 --- a/lib/prediction_data.rb +++ b/lib/prediction_data.rb @@ -131,14 +131,20 @@ module Lib end actual_values = [] + tmp_compounds = [] 
compounds.each do |c| case feature_type when "classification" - actual_values << classification_val(test_target_dataset, c, prediction_feature, accept_values) + vals = classification_vals(test_target_dataset, c, prediction_feature, accept_values) when "regression" - actual_values << regression_val(test_target_dataset, c, prediction_feature) + vals = regression_vals(test_target_dataset, c, prediction_feature) + end + vals.each do |v| + actual_values << v + tmp_compounds << c end end + compounds = tmp_compounds task.progress( task_status += task_step ) if task # loaded actual values prediction_dataset = Lib::DatasetCache.find prediction_dataset_uri,subjectid @@ -174,10 +180,12 @@ module Lib else case feature_type when "classification" - predicted_values << classification_val(prediction_dataset, c, predicted_variable, accept_values) + vals = classification_vals(prediction_dataset, c, predicted_variable, accept_values) when "regression" - predicted_values << regression_val(prediction_dataset, c, predicted_variable) + vals = regression_vals(prediction_dataset, c, predicted_variable) end + raise "not yet implemented: more than one prediction for one compound" if vals.size>1 + predicted_values << vals[0] if predicted_confidence confidence_values << confidence_val(prediction_dataset, c, predicted_confidence) else @@ -229,20 +237,28 @@ module Lib end private - def self.regression_val(dataset, compound, feature) - v = value(dataset, compound, feature) - begin - v = v.to_f unless v==nil or v.is_a?(Numeric) - v - rescue - LOGGER.warn "no numeric value for regression: '"+v.to_s+"'" - nil + def self.regression_vals(dataset, compound, feature) + v_num = [] + values(dataset, compound, feature).each do |v| + if v==nil or v.is_a?(Numeric) + v_num << v + else + begin + v_num << v.to_f + rescue + LOGGER.warn "no numeric value for regression: '"+v.to_s+"'" + v_num << nil + end + end end + v_num end def self.confidence_val(dataset, compound, confidence) - v = value(dataset, compound, 
confidence) + v = values(dataset, compound, confidence) + raise "not yet implemented: duplicate conf value" if v.size>1 begin + v = v[0] v = v.to_f unless v==nil or v.is_a?(Numeric) v rescue @@ -251,39 +267,31 @@ module Lib end end - def self.classification_val(dataset, compound, feature, accept_values) - v = value(dataset, compound, feature) - i = accept_values.index(v.to_s) - raise "illegal class_value of prediction (value is '"+v.to_s+"'), accept values are "+ - accept_values.inspect unless v==nil or i!=nil - i + def self.classification_vals(dataset, compound, feature, accept_values) + v_indices = [] + values(dataset, compound, feature).each do |v| + i = accept_values.index(v.to_s) + raise "illegal class_value of prediction (value is '"+v.to_s+"'), accept values are "+ + accept_values.inspect unless v==nil or i!=nil + v_indices << i + end + v_indices end - def self.value(dataset, compound, feature) - return nil if dataset.data_entries[compound]==nil + def self.values(dataset, compound, feature) + return [nil] if dataset.data_entries[compound]==nil if feature==nil v = dataset.data_entries[compound].values[0] else v = dataset.data_entries[compound][feature] end - return nil if v==nil + return [nil] if v==nil + # sanity checks raise "no array "+v.class.to_s+" : '"+v.to_s+"'" unless v.is_a?(Array) - if v.size>1 - v.uniq! - if v.size>1 - v = nil - LOGGER.warn "not yet implemented: multiple non-equal values "+compound.to_s+" "+v.inspect - else - v = v[0] - end - elsif v.size==1 - v = v[0] - else - v = nil - end - raise "array" if v.is_a?(Array) - v = nil if v.to_s.size==0 - v + v.each{|vv| raise "array-elem is array" if vv.is_a?(Array)} + # replace empty strings with nil + v_mod = v.collect{|vv| (vv.to_s().size==0 ? 
nil : vv)} + v_mod end end end \ No newline at end of file -- cgit v1.2.3 From 9f2be4ca3bded1543142f5e3654693ce65aadb44 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 18 Jan 2012 17:54:11 +0100 Subject: add super-stratification split for training test splitting --- lib/r-util.rb | 82 +++++++++++++++++++++++ lib/stratification.R | 123 +++++++++++++++++++++++++++++++++++ validation/validation_application.rb | 8 ++- validation/validation_service.rb | 65 +++++++++--------- 4 files changed, 240 insertions(+), 38 deletions(-) create mode 100644 lib/r-util.rb create mode 100644 lib/stratification.R diff --git a/lib/r-util.rb b/lib/r-util.rb new file mode 100644 index 0000000..0d58389 --- /dev/null +++ b/lib/r-util.rb @@ -0,0 +1,82 @@ +# pending: package dir hack --------- +# CONFIG[:base_dir] = "/home//opentox-ruby/www" +# PACKAGE_DIR = "/home//opentox-ruby/r-packages" +package_dir = CONFIG[:base_dir].split("/") +package_dir[-1] = "r-packages" +package_dir = package_dir.join("/") +PACKAGE_DIR = package_dir + + + +module Lib + + module RUtil + + def self.dataset_to_dataframe( dataset ) + LOGGER.debug "convert dataset to dataframe #{dataset.uri}" + all_features = [] + dataset.features.each do |f| + feat_name = "feature_#{f[0].split("/")[-1]}" + LOGGER.debug "- adding feature: #{feat_name}" + feat = OpenTox::Feature.find(f[0]) + nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature) + values = [] + dataset.compounds.each do |c| + val = dataset.data_entries[c][f[0]] + raise "not yet implemented" if val!=nil && val.size>1 + v = val==nil ? 
"" : val[0].to_s + v = "NA" if v.size()==0 + values << v + end + all_features << feat_name + @@r.assign feat_name,values + @@r.eval "#{feat_name} <- as.numeric(#{feat_name})" unless nominal + end + df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}" + cmd = "#{df_name} <- data.frame(#{all_features.join(",")})" + @@r.eval cmd + #@@r.eval "head(#{df_name})" + df_name + end + + def self.stratified_split( dataframe, pct=0.3, seed=42 ) + @@r.eval "set.seed(#{seed})" + @@r.eval "split <- stratified_split(#{dataframe}, ratio=#{pct})" + split = @@r.pull 'split' + split.collect{|s| s.to_i} + end + + def self.package_installed?( package ) + @@r.eval ".libPaths(\"#{PACKAGE_DIR}\")" + p = @@r.pull "installed.packages()[,1]" + p.include?(package) + end + + def self.install_packages( package ) + unless package_installed? package + @@r.eval "install.packages(\"#{package}\", repos=\"http://cran.r-project.org\", dependencies=T, lib=\"#{PACKAGE_DIR}\")" + end + end + + def self.library( package ) + install_packages( package ) + @@r.eval "library(\"#{package}\")" + end + + def self.init_r + @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r + library("sampling") + library("gam") + @@r.eval "source(\"#{PACKAGE_DIR}/stratification.R\")" + end + + def self.quit_r + begin + @@r.quit + @@r = nil + rescue + end + end + + end +end diff --git a/lib/stratification.R b/lib/stratification.R new file mode 100644 index 0000000..9aa8d1f --- /dev/null +++ b/lib/stratification.R @@ -0,0 +1,123 @@ +library("sampling") +library("gam") + +nominal_to_binary <- function( orig_data ) +{ + data = as.data.frame( orig_data ) + result = NULL + for (i in 1:ncol(data)) + { + #print(i) + if (is.numeric( data[,i] ) ) + { + if (is.null(result)) + result = data.frame(data[,i]) + else + result = data.frame(result, data[,i]) + colnames(result)[ncol(result)] <- colnames(data)[i] + } + else + { + vals = unique(data[,i]) + for (j in 1:length(vals)) + { + #print(j) + bins = c() + for (k in 1:nrow(data)) + 
{ + if(data[,i][k] == vals[j]) + bins = c(bins,1) + else + bins = c(bins,0) + } + #print(bins) + if (is.null(result)) + result = data.frame(bins) + else + result = data.frame(result, bins) + colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j]) + if (length(vals)==2) break + } + } + } + result +} + +process_data <- function( data ) +{ + if (!is.numeric(data)) + data.num = nominal_to_binary(data) + else + data.num = data + if(any(is.na(data.num))) + data.repl = na.gam.replace(data.num) + else + data.repl = data.num + data.repl +} + +stratified_split <- function( data, ratio=0.3 ) +{ + data.processed = as.matrix(process_data( data )) + pik = rep(ratio,times=nrow(data.processed)) + data.strat = cbind(pik,data.processed) + samplecube(data.strat,pik,order=2,comment=F) +} + +stratified_k_fold_split <- function( data, num_folds=10 ) +{ + print(paste(num_folds,"-fold-split, data-size",nrow(data))) + data.processed = as.matrix(process_data( data )) + folds = rep(0, times=nrow(data)) + for (i in 1:(num_folds-1)) + { + prop = 1/(num_folds-(i-1)) + print(paste("fold",i,"/",num_folds," prop",prop)) + pik = rep(prop,times=nrow(data)) + for (j in 1:nrow(data)) + if(folds[j]!=0) + pik[j]=0 + data.strat = cbind(pik,data.processed) + s<-samplecube(data.strat,pik,order=2,comment=F) + print(paste("fold size: ",sum(s))) + for (j in 1:nrow(data)) + if (s[j] == 1) + folds[j]=i + } + for (j in 1:nrow(data)) + if (folds[j] == 0) + folds[j]=num_folds + folds +} + +plot_split <- function( data, split ) +{ + data.processed = process_data( data ) + data.pca <- prcomp(data.processed, scale=TRUE) + data.2d =as.data.frame(data.pca$x)[1:2] + plot( NULL, + xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]), + xlab = "pc 1", ylab = "pc 2") + for (j in 0:max(split)) + { + set = c() + for (i in 1:nrow(data)) + if (split[i] == j) + set = c(set,i) + points(data.2d[set,], pch = 2, col=(j+1)) + } +} + +#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5) +#b<-matrix(rnorm(5000, 
mean=0, sd=10), ncol=5) +#data<-rbind(a,b) +#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5) +#data<-rbind(data,c) +#data=iris +#split = stratified_k_fold_split(data, num_folds=3) +#split = stratified_split(data, ratio=0.3) +#plot_split(data,split) + + + + diff --git a/validation/validation_application.rb b/validation/validation_application.rb index c02b5f3..cda09fa 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -453,8 +453,9 @@ post '/training_test_split' do raise OpenTox::BadRequestError.new "algorithm_uri missing" unless params[:algorithm_uri].to_s.size>0 raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0 task = OpenTox::Task.create( "Perform training test split validation", url_for("/training_test_split", :full) ) do |task| #, params + strat = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified] params.merge!( Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], - @subjectid, params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33))) + @subjectid, strat, params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33))) v = Validation::Validation.create :validation_type => "training_test_split", :training_dataset_uri => params[:training_dataset_uri], :test_dataset_uri => params[:test_dataset_uri], @@ -544,8 +545,9 @@ end post '/plain_training_test_split' do LOGGER.info "creating pure training test split "+params.inspect raise OpenTox::BadRequestError.new "dataset_uri missing" unless params[:dataset_uri] - - result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], params[:split_ratio], params[:random_seed]) + strat = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified] + result = 
Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], @subjectid, + strat, params[:split_ratio], params[:random_seed]) content_type "text/uri-list" result[:training_dataset_uri]+"\n"+result[:test_dataset_uri]+"\n" end diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 889c652..dceead9 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -2,6 +2,7 @@ require "lib/validation_db.rb" require "lib/ot_predictions.rb" +require "lib/r-util.rb" require "validation/validation_format.rb" @@ -618,17 +619,17 @@ module Validation # splits a dataset into test and training dataset # returns map with training_dataset_uri and test_dataset_uri - def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, split_ratio=nil, random_seed=nil, task=nil ) + def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified=false, split_ratio=nil, random_seed=nil, task=nil ) split_ratio=0.67 unless split_ratio split_ratio = split_ratio.to_f random_seed=1 unless random_seed random_seed = random_seed.to_i + raise OpenTox::NotFoundError.new "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f + raise OpenTox::NotFoundError.new "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1 orig_dataset = Lib::DatasetCache.find orig_dataset_uri, subjectid orig_dataset.load_all subjectid raise OpenTox::NotFoundError.new "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset - raise OpenTox::NotFoundError.new "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f - raise OpenTox::NotFoundError.new "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1 if prediction_feature raise OpenTox::NotFoundError.new "Prediction feature '"+prediction_feature.to_s+ "' not found in dataset, features are: \n"+ @@ 
-637,55 +638,49 @@ module Validation LOGGER.warn "no prediciton feature given, all features included in test dataset" end - compounds = orig_dataset.compounds - raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2 - split = (compounds.size*split_ratio).to_i - split = [split,1].max - split = [split,compounds.size-2].min - - LOGGER.debug "splitting dataset "+orig_dataset_uri+ + if stratified + Lib::RUtil.init_r + df = Lib::RUtil.dataset_to_dataframe( orig_dataset ) + split = Lib::RUtil.stratified_split( df, split_ratio, random_seed ) + Lib::RUtil.quit_r + raise "internal error" unless split.size==orig_dataset.compounds.size + task.progress(33) if task + + training_compounds = [] + split.size.times{|i| training_compounds << orig_dataset.compounds[i] if split[i]==1} + test_compounds = orig_dataset.compounds - training_compounds + else + compounds = orig_dataset.compounds + raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2 + split = (compounds.size*split_ratio).to_i + split = [split,1].max + split = [split,compounds.size-2].min + LOGGER.debug "splitting dataset "+orig_dataset_uri+ " into train:0-"+split.to_s+" and test:"+(split+1).to_s+"-"+(compounds.size-1).to_s+ " (shuffled with seed "+random_seed.to_s+")" - compounds.shuffle!( random_seed ) + compounds.shuffle!( random_seed ) + training_compounds = compounds[0..split] + test_compounds = compounds[(split+1)..-1] + end task.progress(33) if task - - result = {} -# result[:training_dataset_uri] = orig_dataset.create_new_dataset( compounds[0..split], -# orig_dataset.features, -# "Training dataset split of "+orig_dataset.title.to_s, -# $sinatra.url_for('/training_test_split',:full) ) -# orig_dataset.data_entries.each do |k,v| -# puts k.inspect+" =>"+v.inspect -# puts v.values[0].to_s+" "+v.values[0].class.to_s -# end + result = {} - result[:training_dataset_uri] 
= orig_dataset.split( compounds[0..split], + result[:training_dataset_uri] = orig_dataset.split( training_compounds, orig_dataset.features.keys, { DC.title => "Training dataset split of "+orig_dataset.title.to_s, DC.creator => $url_provider.url_for('/training_test_split',:full) }, subjectid ).uri task.progress(66) if task -# d = Lib::DatasetCache.find(result[:training_dataset_uri]) -# d.data_entries.values.each do |v| -# puts v.inspect -# puts v.values[0].to_s+" "+v.values[0].class.to_s -# end -# raise "stop here" - -# result[:test_dataset_uri] = orig_dataset.create_new_dataset( compounds[(split+1)..-1], -# orig_dataset.features.dclone - [prediction_feature], -# "Test dataset split of "+orig_dataset.title.to_s, -# $sinatra.url_for('/training_test_split',:full) ) - result[:test_dataset_uri] = orig_dataset.split( compounds[(split+1)..-1], + result[:test_dataset_uri] = orig_dataset.split( test_compounds, orig_dataset.features.keys.dclone - [prediction_feature], { DC.title => "Test dataset split of "+orig_dataset.title.to_s, DC.creator => $url_provider.url_for('/training_test_split',:full) }, subjectid ).uri task.progress(100) if task - if ENV['RACK_ENV'] =~ /test|debug/ + if !stratified and ENV['RACK_ENV'] =~ /test|debug/ raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless Lib::DatasetCache.find(result[:training_dataset_uri],subjectid) test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid -- cgit v1.2.3 From eda4b9687ca4b093b7f194b6d0b2e58ce7eed3b2 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 18 Jan 2012 18:56:48 +0100 Subject: wrap plain split into task --- validation/validation_application.rb | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/validation/validation_application.rb b/validation/validation_application.rb index cda09fa..279cd14 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -543,13 +543,16 @@ post 
'/cleanup_datasets/?' do end post '/plain_training_test_split' do - LOGGER.info "creating pure training test split "+params.inspect - raise OpenTox::BadRequestError.new "dataset_uri missing" unless params[:dataset_uri] + LOGGER.info "creating pure training test split "+params.inspect + raise OpenTox::BadRequestError.new "dataset_uri missing" unless params[:dataset_uri] + task = OpenTox::Task.create( "Create data-split", url_for("/plain_training_test_split", :full) ) do |task| strat = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified] result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], @subjectid, strat, params[:split_ratio], params[:random_seed]) content_type "text/uri-list" result[:training_dataset_uri]+"\n"+result[:test_dataset_uri]+"\n" + end + return_task(task) end post '/validate_datasets' do -- cgit v1.2.3 From 63320057e2a2b2121c5c405c31e2e7b709fa9e44 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 31 Jan 2012 12:37:36 +0100 Subject: adapt validation to r-util (split), add new r plots --- lib/merge.rb | 10 ++-- lib/r-util.rb | 82 -------------------------- lib/stratification.R | 123 --------------------------------------- report/environment.rb | 13 +++++ report/plot_factory.rb | 55 +++++++++++++++++ report/report_content.rb | 85 ++++++++++++++++++++++++++- report/report_factory.rb | 74 ++++++++++++++++++----- report/report_service.rb | 4 +- report/statistical_test.rb | 49 +--------------- report/validation_access.rb | 60 ++++++++++++++++--- report/validation_data.rb | 23 +++++++- validation/validation_service.rb | 74 +++++++++++------------ 12 files changed, 325 insertions(+), 327 deletions(-) delete mode 100644 lib/r-util.rb delete mode 100644 lib/stratification.R diff --git a/lib/merge.rb b/lib/merge.rb index f30a3c1..bc6e1a7 100644 --- a/lib/merge.rb +++ b/lib/merge.rb @@ -31,6 +31,11 @@ module Lib return merge_count(object)>1 end + def 
self.merge_count( object ) + @@merge_count[object] = 1 if @@merge_count[object]==nil + return @@merge_count[object] + end + def self.merge_objects( object1, object2 ) raise "classes not equal : "+object1.class.to_s+" != "+object2.class.to_s if object1.class != object2.class object_class = object1.class @@ -137,11 +142,6 @@ module Lib {:value => value, :variance => variance } end - def self.merge_count( object ) - @@merge_count[object] = 1 if @@merge_count[object]==nil - return @@merge_count[object] - end - def self.set_merge_count(object, merge_count) @@merge_count[object] = merge_count end diff --git a/lib/r-util.rb b/lib/r-util.rb deleted file mode 100644 index 0d58389..0000000 --- a/lib/r-util.rb +++ /dev/null @@ -1,82 +0,0 @@ -# pending: package dir hack --------- -# CONFIG[:base_dir] = "/home//opentox-ruby/www" -# PACKAGE_DIR = "/home//opentox-ruby/r-packages" -package_dir = CONFIG[:base_dir].split("/") -package_dir[-1] = "r-packages" -package_dir = package_dir.join("/") -PACKAGE_DIR = package_dir - - - -module Lib - - module RUtil - - def self.dataset_to_dataframe( dataset ) - LOGGER.debug "convert dataset to dataframe #{dataset.uri}" - all_features = [] - dataset.features.each do |f| - feat_name = "feature_#{f[0].split("/")[-1]}" - LOGGER.debug "- adding feature: #{feat_name}" - feat = OpenTox::Feature.find(f[0]) - nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature) - values = [] - dataset.compounds.each do |c| - val = dataset.data_entries[c][f[0]] - raise "not yet implemented" if val!=nil && val.size>1 - v = val==nil ? 
"" : val[0].to_s - v = "NA" if v.size()==0 - values << v - end - all_features << feat_name - @@r.assign feat_name,values - @@r.eval "#{feat_name} <- as.numeric(#{feat_name})" unless nominal - end - df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}" - cmd = "#{df_name} <- data.frame(#{all_features.join(",")})" - @@r.eval cmd - #@@r.eval "head(#{df_name})" - df_name - end - - def self.stratified_split( dataframe, pct=0.3, seed=42 ) - @@r.eval "set.seed(#{seed})" - @@r.eval "split <- stratified_split(#{dataframe}, ratio=#{pct})" - split = @@r.pull 'split' - split.collect{|s| s.to_i} - end - - def self.package_installed?( package ) - @@r.eval ".libPaths(\"#{PACKAGE_DIR}\")" - p = @@r.pull "installed.packages()[,1]" - p.include?(package) - end - - def self.install_packages( package ) - unless package_installed? package - @@r.eval "install.packages(\"#{package}\", repos=\"http://cran.r-project.org\", dependencies=T, lib=\"#{PACKAGE_DIR}\")" - end - end - - def self.library( package ) - install_packages( package ) - @@r.eval "library(\"#{package}\")" - end - - def self.init_r - @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r - library("sampling") - library("gam") - @@r.eval "source(\"#{PACKAGE_DIR}/stratification.R\")" - end - - def self.quit_r - begin - @@r.quit - @@r = nil - rescue - end - end - - end -end diff --git a/lib/stratification.R b/lib/stratification.R deleted file mode 100644 index 9aa8d1f..0000000 --- a/lib/stratification.R +++ /dev/null @@ -1,123 +0,0 @@ -library("sampling") -library("gam") - -nominal_to_binary <- function( orig_data ) -{ - data = as.data.frame( orig_data ) - result = NULL - for (i in 1:ncol(data)) - { - #print(i) - if (is.numeric( data[,i] ) ) - { - if (is.null(result)) - result = data.frame(data[,i]) - else - result = data.frame(result, data[,i]) - colnames(result)[ncol(result)] <- colnames(data)[i] - } - else - { - vals = unique(data[,i]) - for (j in 1:length(vals)) - { - #print(j) - bins = c() - for (k in 
1:nrow(data)) - { - if(data[,i][k] == vals[j]) - bins = c(bins,1) - else - bins = c(bins,0) - } - #print(bins) - if (is.null(result)) - result = data.frame(bins) - else - result = data.frame(result, bins) - colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j]) - if (length(vals)==2) break - } - } - } - result -} - -process_data <- function( data ) -{ - if (!is.numeric(data)) - data.num = nominal_to_binary(data) - else - data.num = data - if(any(is.na(data.num))) - data.repl = na.gam.replace(data.num) - else - data.repl = data.num - data.repl -} - -stratified_split <- function( data, ratio=0.3 ) -{ - data.processed = as.matrix(process_data( data )) - pik = rep(ratio,times=nrow(data.processed)) - data.strat = cbind(pik,data.processed) - samplecube(data.strat,pik,order=2,comment=F) -} - -stratified_k_fold_split <- function( data, num_folds=10 ) -{ - print(paste(num_folds,"-fold-split, data-size",nrow(data))) - data.processed = as.matrix(process_data( data )) - folds = rep(0, times=nrow(data)) - for (i in 1:(num_folds-1)) - { - prop = 1/(num_folds-(i-1)) - print(paste("fold",i,"/",num_folds," prop",prop)) - pik = rep(prop,times=nrow(data)) - for (j in 1:nrow(data)) - if(folds[j]!=0) - pik[j]=0 - data.strat = cbind(pik,data.processed) - s<-samplecube(data.strat,pik,order=2,comment=F) - print(paste("fold size: ",sum(s))) - for (j in 1:nrow(data)) - if (s[j] == 1) - folds[j]=i - } - for (j in 1:nrow(data)) - if (folds[j] == 0) - folds[j]=num_folds - folds -} - -plot_split <- function( data, split ) -{ - data.processed = process_data( data ) - data.pca <- prcomp(data.processed, scale=TRUE) - data.2d =as.data.frame(data.pca$x)[1:2] - plot( NULL, - xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]), - xlab = "pc 1", ylab = "pc 2") - for (j in 0:max(split)) - { - set = c() - for (i in 1:nrow(data)) - if (split[i] == j) - set = c(set,i) - points(data.2d[set,], pch = 2, col=(j+1)) - } -} - -#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5) 
-#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5) -#data<-rbind(a,b) -#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5) -#data<-rbind(data,c) -#data=iris -#split = stratified_k_fold_split(data, num_folds=3) -#split = stratified_split(data, ratio=0.3) -#plot_split(data,split) - - - - diff --git a/report/environment.rb b/report/environment.rb index 34554f7..7addc45 100755 --- a/report/environment.rb +++ b/report/environment.rb @@ -6,6 +6,19 @@ end gem 'ruby-plot', "~>0.6.0" module Reports + + def self.r_util + @@r_util = OpenTox::RUtil.new unless defined?@@r_util and @@r_util + @@r_util + end + + def self.quit_r + if defined?@@r_util and @@r_util + @@r_util.quit_r + @@r_util = nil + end + end + end require "lib/ot_predictions.rb" diff --git a/report/plot_factory.rb b/report/plot_factory.rb index 2d7946f..f114dd3 100644 --- a/report/plot_factory.rb +++ b/report/plot_factory.rb @@ -100,6 +100,34 @@ module Reports omit_count end + def self.create_train_test_plot( out_files, validation_set, only_prediction_feature, waiting_task ) + if only_prediction_feature + train = [] + test = [] + validation_set.validations.each do |v| + [[v.test_dataset_uri, test, v.test_target_dataset_uri], + [v.training_dataset_uri, train, v.training_dataset_uri]].each do |uri,array,uri2| + d = Lib::DatasetCache.find(uri, validation_set.validations[0].subjectid) + d2 = Lib::DatasetCache.find((uri2 ? 
uri2 : uri), validation_set.validations[0].subjectid) + d.compounds.each do |c| + d2.data_entries[c][v.prediction_feature].each do |val| + array << val + end if d2.data_entries[c] and d2.data_entries[c][v.prediction_feature] + end + end + end + waiting_task.progress(50) if waiting_task + + numerical = validation_set.unique_feature_type=="regression" + Reports::r_util.double_hist_plot(out_files, train, test, numerical, numerical, "Training Data", "Test Data", + "Prediction Feature Distribution", validation_set.validations.first.prediction_feature ) + else + Reports::r_util.feature_value_plot(out_files, validation_set.validations[0].training_feature_dataset_uri, + validation_set.validations[0].test_feature_dataset_uri, "Training Data", "Test Data", + nil, true, validation_set.validations[0].subjectid, waiting_task ) + end + end + # creates a roc plot (result is plotted into out_file) # * if (split_set_attributes == nil?) @@ -193,6 +221,33 @@ module Reports end end + def self.create_box_plot( out_files, validation_set, title_attribute, value_attribute, class_value ) + + out_files = [out_files] unless out_files.is_a?(Array) + LOGGER.debug "creating box plot, out-files:"+out_files.inspect + + data = {} + validation_set.validations.each do |v| + value = v.send(value_attribute) + if value.is_a?(Hash) + if class_value==nil + avg_value = 0 + value.values.each{ |val| avg_value+=val } + value = avg_value/value.values.size.to_f + else + raise "box plot value is hash, but no entry for class-value ("+class_value.to_s+ + "); value for "+value_attribute.to_s+" -> "+value.inspect unless value.key?(class_value) + value = value[class_value] + end + end + + data[v.send(title_attribute).to_s] = [] unless data[v.send(title_attribute).to_s] + data[v.send(title_attribute).to_s] << value + end + + Reports::r_util.boxplot( out_files, data) + end + def self.create_bar_plot( out_files, validation_set, title_attribute, value_attributes ) out_files = [out_files] unless out_files.is_a?(Array) 
diff --git a/report/report_content.rb b/report/report_content.rb index 3d92b52..80473c5 100755 --- a/report/report_content.rb +++ b/report/report_content.rb @@ -61,7 +61,6 @@ class Reports::ReportContent test_matrix[:num_results].to_s, table, true, true) end end - Reports::ReportStatisticalTest.quit_r end def add_predictions( validation_set, @@ -183,6 +182,39 @@ class Reports::ReportContent align_last_two_images section_title+" in logarithmic and linear scale (values <= 0 are omitted in logarithmic scale)" end + def add_train_test_plot( validation_set, + only_prediction_feature, + waiting_task, + section_title="Training Test Distribution Plot", + section_text=nil, + image_title=nil) + + section_plot = @current_section + prediction_set = validation_set.collect{ |v| v.get_predictions } + @xml_report.add_paragraph(section_plot, section_text) if section_text + + begin + plot_png = add_tmp_file("train_test_plot_#{only_prediction_feature}", "png") + plot_svg = add_tmp_file("train_test_plot_#{only_prediction_feature}", "svg") + omit_count = Reports::PlotFactory.create_train_test_plot( [plot_png[:path], plot_svg[:path]], + prediction_set, only_prediction_feature, waiting_task ) + unless image_title + if only_prediction_feature + image_title = "Prediction Feature: #{validation_set.validations.first.prediction_feature}" + else + image_title = "Features Excluding Prediction Feature" + end + end + @xml_report.add_imagefigure(section_plot, image_title, plot_png[:name], "PNG", 100, plot_svg[:name]) + rescue Exception => ex + LOGGER.error("Could not create train test plot: "+ex.message) + rm_tmp_file(plot_png[:name]) if plot_png + rm_tmp_file(plot_svg[:name]) if plot_svg + @xml_report.add_paragraph(section_plot, "could not create train test plot: "+ex.message) + end + + end + def add_roc_plot( validation_set, accept_value, split_set_attribute=nil, @@ -317,6 +349,57 @@ class Reports::ReportContent @xml_report.add_imagefigure(section_bar, image_title, plot_png[:name], "PNG", 100, 
plot_svg[:name]) end + def add_box_plot(validation_set, + title_attribute, + value_attributes, + section_title="Boxplots", + section_text=nil) + + section_box = @xml_report.add_section(@current_section, section_title) + @xml_report.add_paragraph(section_box, section_text) if section_text + + plot_png = nil; plot_svg = nil + begin + plot_input = [] + value_attributes.each do |a| + accept = validation_set.get_accept_values_for_attr(a) + if accept and accept.size>0 + accept.each do |c| + title = a.to_s.gsub("_","-") + ( (accept.size==1 || c==nil) ? "" : "("+c.to_s+")" ) + plot_input << [a,c,title] + end + else + plot_input << [a,nil,a.to_s.gsub("_","-")] + end + end + + i = 0 + figs = [] + plot_input.each do |attrib,class_value,image_title| + plot_png = add_tmp_file("box_plot#{i}", "png") + plot_svg = add_tmp_file("box_plot#{i}", "svg") + Reports::PlotFactory.create_box_plot([plot_png[:path], plot_svg[:path]], + validation_set, title_attribute, attrib, class_value ) + figs << @xml_report.imagefigure(image_title, plot_png[:name], + "PNG", 50, plot_svg[:name]) + plot_png = nil; plot_svg = nil + i += 1 + end + + i = 1 + figs.each_slice(4) do |f| + @xml_report.add_imagefigures_in_row(section_box,f,"Boxplots #{i}") + i+=1 + end + rescue Exception => ex + msg = "WARNING could not create box plot: "+ex.message + LOGGER.error(msg) + rm_tmp_file(plot_png[:name]) if plot_png + rm_tmp_file(plot_svg[:name]) if plot_svg + @xml_report.add_paragraph(section_box, msg) + end + end + private def add_tmp_file(name, extension) tmp_file_name = name.to_s+@tmp_file_count.to_s+"."+extension.to_s diff --git a/report/report_factory.rb b/report/report_factory.rb index 2b978c5..2bb74ee 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -11,10 +11,10 @@ VAL_ATTR_REGR = [ :num_instances, :num_unpredicted, :root_mean_squared_error, :weighted_root_mean_squared_error, :mean_absolute_error, :weighted_mean_absolute_error, :r_square, :weighted_r_square, 
:sample_correlation_coefficient ] -#VAL_ATTR_BAR_PLOT_CLASS = [ :accuracy, :average_area_under_roc, +#VAL_ATTR_BOX_PLOT_CLASS = [ :accuracy, :average_area_under_roc, # :area_under_roc, :f_measure, :true_positive_rate, :true_negative_rate ] -VAL_ATTR_BAR_PLOT_CLASS = [ :accuracy, :f_measure, :true_positive_rate, :true_negative_rate, :positive_predictive_value, :negative_predictive_value ] -VAL_ATTR_BAR_PLOT_REGR = [ :root_mean_squared_error, :mean_absolute_error, :r_square ] +VAL_ATTR_BOX_PLOT_CLASS = [ :accuracy, :area_under_roc, :f_measure, :true_positive_rate, :true_negative_rate, :positive_predictive_value, :negative_predictive_value ] +VAL_ATTR_BOX_PLOT_REGR = [ :root_mean_squared_error, :mean_absolute_error, :r_square ] VAL_ATTR_TTEST_REGR = [ :r_square, :root_mean_squared_error ] VAL_ATTR_TTEST_CLASS = [ :accuracy, :average_area_under_roc ] @@ -29,8 +29,9 @@ module Reports::ReportFactory RT_VALIDATION = "validation" RT_CV = "crossvalidation" RT_ALG_COMP = "algorithm_comparison" + RT_METHOD_COMP = "method_comparison" - REPORT_TYPES = [RT_VALIDATION, RT_CV, RT_ALG_COMP ] + REPORT_TYPES = [RT_VALIDATION, RT_CV, RT_ALG_COMP, RT_METHOD_COMP ] # creates a report of a certain type according to the validation data in validation_set # @@ -40,11 +41,13 @@ module Reports::ReportFactory def self.create_report(type, validation_set, params={}, task=nil) case type when RT_VALIDATION - create_report_validation(validation_set, task) + create_report_validation(validation_set, {}, task) when RT_CV - create_report_crossvalidation(validation_set, task) + create_report_crossvalidation(validation_set, {}, task) when RT_ALG_COMP create_report_compare_algorithms(validation_set, params, task) + when RT_METHOD_COMP + create_report_compare_methods(validation_set, params, task) else raise "unknown report type "+type.to_s end @@ -78,7 +81,7 @@ module Reports::ReportFactory raise OpenTox::BadRequestError.new("num validations is not equal to 1") unless validation_set.size==1 val = 
validation_set.validations[0] - pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,80) ) + pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,50) ) report = Reports::ReportContent.new("Validation report") add_filter_warning(report, validation_set.filter_params) if validation_set.filter_params!=nil @@ -103,7 +106,6 @@ module Reports::ReportFactory report.add_confidence_plot(validation_set, :positive_predictive_value, accept_value) report.align_last_two_images "Confidence Plots" end - report.end_section when "regression" report.add_result(validation_set, [:validation_uri] + VAL_ATTR_TRAIN_TEST + VAL_ATTR_REGR, "Results", "Results") report.add_section("Plots") @@ -111,10 +113,13 @@ module Reports::ReportFactory report.add_confidence_plot(validation_set, :root_mean_squared_error, nil) report.add_confidence_plot(validation_set, :r_square, nil) report.align_last_two_images "Confidence Plots" - report.end_section end - task.progress(90) if task - + task.progress(70) if task + report.add_train_test_plot( validation_set, false, OpenTox::SubTask.create(task,70,80) ) + report.add_train_test_plot( validation_set, true, OpenTox::SubTask.create(task,80,90) ) + report.align_last_two_images "Training Test Data Distribution Plots" + report.end_section + report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results") report.add_predictions( validation_set ) task.progress(100) if task @@ -248,11 +253,11 @@ module Reports::ReportFactory when "classification" result_attributes += VAL_ATTR_CLASS ttest_attributes = VAL_ATTR_TTEST_CLASS - bar_plot_attributes = VAL_ATTR_BAR_PLOT_CLASS + box_plot_attributes = VAL_ATTR_BOX_PLOT_CLASS else result_attributes += VAL_ATTR_REGR ttest_attributes = VAL_ATTR_TTEST_REGR - bar_plot_attributes = VAL_ATTR_BAR_PLOT_REGR + box_plot_attributes = VAL_ATTR_BOX_PLOT_REGR end if params[:ttest_attributes] and params[:ttest_attributes].chomp.size>0 @@ -263,8 +268,8 @@ module Reports::ReportFactory 
ttest_significance = params[:ttest_significance].to_f end - bar_plot_attributes += ttest_attributes - bar_plot_attributes.uniq! + box_plot_attributes += ttest_attributes + box_plot_attributes.uniq! result_attributes += ttest_attributes result_attributes.uniq! @@ -287,13 +292,50 @@ module Reports::ReportFactory res_text = "These performance statistics have been derieved by computing the mean of the statistics on each crossvalidation fold." report.add_result(merged,result_attributes,res_titel,res_titel,res_text) # pending: regression stats have different scales!!! - report.add_bar_plot(merged, :identifier, bar_plot_attributes) if validation_set.unique_feature_type=="classification" + report.add_box_plot(set, :identifier, box_plot_attributes) report.add_paired_ttest_tables(set, :identifier, ttest_attributes, ttest_significance) if ttest_significance>0 report.end_section end task.progress(100) if task report end + + def self.create_report_compare_methods(validation_set, params={}, task=nil) + raise OpenTox::BadRequestError.new("num validations is not >1") unless validation_set.size>1 + raise OpenTox::BadRequestError.new("validations must have unique feature type, i.e. 
must be either all regression, "+ + "or all classification validations") unless validation_set.unique_feature_type + raise OpenTox::BadRequestError.new("number of different identifiers <2: "+ + validation_set.get_values(:identifier).inspect) if validation_set.num_different_values(:identifier)<2 + #validation_set.load_cv_attributes + + pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,80) ) + report = Reports::ReportContent.new("Method comparison report") + add_filter_warning(report, validation_set.filter_params) if validation_set.filter_params!=nil + + result_attributes = [:identifier,:validation_uri,:validation_report_uri]+VAL_ATTR_CV-[:crossvalidation_fold,:num_folds,:dataset_uri] + case validation_set.unique_feature_type + when "classification" + result_attributes += VAL_ATTR_CLASS + box_plot_attributes = VAL_ATTR_BOX_PLOT_CLASS + else + result_attributes += VAL_ATTR_REGR + box_plot_attributes = VAL_ATTR_BOX_PLOT_REGR + end + + merged = validation_set.merge([:identifier]) + merged.sort(:identifier) + + merged.validations.each do |v| + v.validation_uri = v.validation_uri.split(";").uniq.join(" ") + v.validation_report_uri = v.validation_report_uri.split(";").uniq.join(" ") if v.validation_report_uri + end + + msg = merged.validations.collect{|v| v.identifier+" ("+Lib::MergeObjects.merge_count(v).to_s+"x)"}.join(", ") + report.add_result(merged,result_attributes,"Average Results","Results",msg) + + report.add_box_plot(validation_set, :identifier, box_plot_attributes) + report + end end diff --git a/report/report_service.rb b/report/report_service.rb index 53a17ab..f315b04 100644 --- a/report/report_service.rb +++ b/report/report_service.rb @@ -89,7 +89,9 @@ module Reports report_content = Reports::ReportFactory.create_report(type, validation_set, params, OpenTox::SubTask.create(task,10,90)) LOGGER.debug "report created" - + Reports::quit_r + Reports.validation_access.delete_tmp_resources(subjectid) + #step 3: persist report if creation not 
failed id = @@persistance.new_report(report_content, type, create_meta_data(type, validation_set, validation_uris), self, subjectid) LOGGER.debug "report persisted with id: '"+id.to_s+"'" diff --git a/report/statistical_test.rb b/report/statistical_test.rb index 4d85555..da46f6b 100644 --- a/report/statistical_test.rb +++ b/report/statistical_test.rb @@ -1,38 +1,6 @@ #require "rubygems" #require "rinruby" -module LIB - class StatisticalTest - - # -1 -> array1 < array2 - # 0 -> not difference - # 1 -> array2 > array1 - # - def self.pairedTTest(array1, array2, significance_level=0.95) - - @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r - @@r.assign "v1",array1 - @@r.assign "v2",array2 - @@r.eval "ttest = t.test(v1,v2,paired=T)" - t = @@r.pull "ttest$statistic" - p = @@r.pull "ttest$p.value" - if (1-significance_level > p) - t - else - 0 - end - end - - def self.quit_r - begin - @@r.quit - @@r = nil - rescue - end - end - end -end - module Reports class ReportStatisticalTest @@ -72,27 +40,12 @@ module Reports array1 = validations1.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value].to_f : v.send(attribute).to_f) } array2 = validations2.collect{ |v| (v.send(attribute).is_a?(Hash) ? 
v.send(attribute)[class_value].to_f : v.send(attribute).to_f) } LOGGER.debug "paired-t-testing "+attribute.to_s+" "+array1.inspect+" vs "+array2.inspect - LIB::StatisticalTest.pairedTTest(array1, array2, significance_level) + Reports::r_util.paired_ttest(array1, array2, significance_level) end - def self.quit_r - LIB::StatisticalTest.quit_r - end - end end -#x=["1.36840891838074", "2.89500403404236", "2.58440494537354", "1.96544003486633", "1.4017288684845", "1.68250012397766", "1.65089893341064", "2.24862003326416", "3.73909902572632", "2.36335206031799"] -#y=["1.9675121307373", "2.30981087684631", "2.59359288215637", "2.62243509292603", "1.98700189590454", "2.26789593696594", "2.03917217254639", "2.69466996192932", "1.96487307548523", "1.65820598602295"] -#puts LIB::StatisticalTest.pairedTTest(x,y) -# -##t1 = Time.new -##10.times do -# puts LIB::StatisticalTest.pairedTTest([1.01,2,3,4,5,12,4,2],[2,3,3,3,56,3,4,5]) -##end -#LIB::StatisticalTest.quit_r -##t2 = Time.new -##puts t2-t1 diff --git a/report/validation_access.rb b/report/validation_access.rb index 2cb0c6f..aaa7bdc 100755 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -7,6 +7,8 @@ require "lib/validation_db.rb" # class Reports::ValidationDB + @@tmp_resources = [] + def same_service?(uri) self_uri = URI.parse($url_provider.url) val_uri = URI.parse(uri) @@ -132,27 +134,67 @@ class Reports::ValidationDB validation.send("#{p.to_s}=".to_sym, cv.send(p.to_s)) end end + + def training_feature_dataset_uri(validation, subjectid) + m = OpenTox::Model::Generic.find(validation.model_uri, subjectid) + if m + f = m.metadata[OT.featureDataset] + return f.chomp if f + end + raise "no feature dataset found" + end + def test_feature_dataset_uri(validation, subjectid) + m = OpenTox::Model::Generic.find(validation.model_uri, subjectid) + feat_gen = nil + m.metadata[OT.parameters].each do |h| + if h[DC.title] and h[DC.title]=~/feature_generation/ and h[OT.paramValue] + feat_gen = h[OT.paramValue] + 
break + end + end if m and m.metadata[OT.parameters] + raise "no feature creation alg found" unless feat_gen + feat_gen = File.join(feat_gen,"match") if feat_gen=~/fminer/ + uri = OpenTox::RestClientWrapper.post(feat_gen,{:subjectid => subjectid, + :feature_dataset_uri=>training_feature_dataset_uri(validation,subjectid), + :dataset_uri=>validation.test_dataset_uri}) + @@tmp_resources << uri + uri + end + + def delete_tmp_resources(subjectid) + @@tmp_resources.each do |uri| + OpenTox::RestClientWrapper.delete uri,{:subjectid=>subjectid} + end + @@tmp_resources = [] + end + def get_predictions(validation, filter_params, subjectid, task) # we need compound info, cannot reuse stored prediction data data = Lib::PredictionData.create( validation.feature_type, validation.test_dataset_uri, validation.test_target_dataset_uri, validation.prediction_feature, validation.prediction_dataset_uri, - validation.predicted_variable, validation.predicted_confidence, subjectid, task ) + validation.predicted_variable, validation.predicted_confidence, subjectid, OpenTox::SubTask.create(task, 0, 80 ) ) data = Lib::PredictionData.filter_data( data.data, data.compounds, filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions] ) if filter_params!=nil + task.progress(100) if task Lib::OTPredictions.new( data.data, data.compounds ) end def get_accept_values( validation, subjectid=nil ) # PENDING So far, one has to load the whole dataset to get the accept_value from ambit - test_target_dataset = validation.test_target_dataset_uri - test_target_dataset = validation.test_dataset_uri unless test_target_dataset - d = Lib::DatasetCache.find( test_target_dataset, subjectid ) - raise "cannot get test target dataset for accept values, dataset: "+test_target_dataset.to_s unless d - accept_values = d.accept_values(validation.prediction_feature) - raise "cannot get accept values from dataset "+test_target_dataset.to_s+" for feature "+ - 
validation.prediction_feature+":\n"+d.features[validation.prediction_feature].to_yaml unless accept_values!=nil - accept_values + test_target_datasets = validation.test_target_dataset_uri + test_target_datasets = validation.test_dataset_uri unless test_target_datasets + res = nil + test_target_datasets.split(";").each do |test_target_dataset| + d = Lib::DatasetCache.find( test_target_dataset, subjectid ) + raise "cannot get test target dataset for accept values, dataset: "+test_target_dataset.to_s unless d + accept_values = d.accept_values(validation.prediction_feature) + raise "cannot get accept values from dataset "+test_target_dataset.to_s+" for feature "+ + validation.prediction_feature+":\n"+d.features[validation.prediction_feature].to_yaml unless accept_values!=nil + raise "different accept values" if res && res!=accept_values + res = accept_values + end + res end def feature_type( validation, subjectid=nil ) diff --git a/report/validation_data.rb b/report/validation_data.rb index eea5229..3806fd7 100755 --- a/report/validation_data.rb +++ b/report/validation_data.rb @@ -94,6 +94,7 @@ module Reports @subjectid = subjectid raise unless filter_params==nil || filter_params.is_a?(Hash) @filter_params = filter_params + @created_resources = [] #raise "subjectid is nil" unless subjectid end @@ -102,7 +103,22 @@ module Reports Reports.validation_access.init_validation_from_cv_statistics(v, cv_uri, filter_params, subjectid) v end - + + def training_feature_dataset_uri + unless @training_feature_dataset + @training_feature_dataset = Reports.validation_access.training_feature_dataset_uri( self, @subjectid ) + end + @training_feature_dataset + end + + #hack this does create the features for the test dataset + def test_feature_dataset_uri + unless @test_feature_dataset + @test_feature_dataset = Reports.validation_access.test_feature_dataset_uri( self, @subjectid ) + end + @test_feature_dataset + end + # returns/creates predictions, cache to save rest-calls/computation 
time # # call-seq: @@ -402,12 +418,17 @@ module Reports end if variance + #puts "variance given #{a}, #{val.inspect}, #{val.class}, #{variance.inspect}, #{variance.class}" if (val.is_a?(Array)) raise "not implemented" elsif (val.is_a?(Hash)) val.collect{ |i,j| i.to_nice_s+": "+j.to_nice_s + " +- " + variance[i].to_nice_s }.join(", ") else + if (variance.is_a?(Hash)) + raise "invalid variance" unless accept_values.size==1 && accept_values[0]!=nil + variance = variance[accept_values[0]] + end val.to_nice_s + " +- " + variance.to_nice_s end else diff --git a/validation/validation_service.rb b/validation/validation_service.rb index dceead9..614363d 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -2,7 +2,6 @@ require "lib/validation_db.rb" require "lib/ot_predictions.rb" -require "lib/r-util.rb" require "validation/validation_format.rb" @@ -639,16 +638,10 @@ module Validation end if stratified - Lib::RUtil.init_r - df = Lib::RUtil.dataset_to_dataframe( orig_dataset ) - split = Lib::RUtil.stratified_split( df, split_ratio, random_seed ) - Lib::RUtil.quit_r - raise "internal error" unless split.size==orig_dataset.compounds.size - task.progress(33) if task - - training_compounds = [] - split.size.times{|i| training_compounds << orig_dataset.compounds[i] if split[i]==1} - test_compounds = orig_dataset.compounds - training_compounds + r_util = OpenTox::RUtil.new + split_sets = r_util.stratified_split( orig_dataset, "NA", df, split_ratio, random_seed ) + r_util.quit_r + result = {:training_dataset_uri => split_sets[0], :test_dataset_uri => split_sets[1]} else compounds = orig_dataset.compounds raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2 @@ -661,37 +654,36 @@ module Validation compounds.shuffle!( random_seed ) training_compounds = compounds[0..split] test_compounds = compounds[(split+1)..-1] + task.progress(33) if task + + result = {} + 
result[:training_dataset_uri] = orig_dataset.split( training_compounds, + orig_dataset.features.keys, + { DC.title => "Training dataset split of "+orig_dataset.title.to_s, + DC.creator => $url_provider.url_for('/training_test_split',:full) }, + subjectid ).uri + task.progress(66) if task + + result[:test_dataset_uri] = orig_dataset.split( test_compounds, + orig_dataset.features.keys.dclone - [prediction_feature], + { DC.title => "Test dataset split of "+orig_dataset.title.to_s, + DC.creator => $url_provider.url_for('/training_test_split',:full) }, + subjectid ).uri + task.progress(100) if task + + if !stratified and ENV['RACK_ENV'] =~ /test|debug/ + raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless + Lib::DatasetCache.find(result[:training_dataset_uri],subjectid) + test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid + raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless test_data + test_data.load_compounds subjectid + raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+ + test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split) + end + + LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'" end - task.progress(33) if task - - result = {} - - result[:training_dataset_uri] = orig_dataset.split( training_compounds, - orig_dataset.features.keys, - { DC.title => "Training dataset split of "+orig_dataset.title.to_s, - DC.creator => $url_provider.url_for('/training_test_split',:full) }, - subjectid ).uri - task.progress(66) if task - - result[:test_dataset_uri] = orig_dataset.split( test_compounds, - orig_dataset.features.keys.dclone - [prediction_feature], - { DC.title => "Test dataset split of "+orig_dataset.title.to_s, - DC.creator => 
$url_provider.url_for('/training_test_split',:full) }, - subjectid ).uri - task.progress(100) if task - - if !stratified and ENV['RACK_ENV'] =~ /test|debug/ - raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless - Lib::DatasetCache.find(result[:training_dataset_uri],subjectid) - test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid - raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless test_data - test_data.load_compounds subjectid - raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+ - test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split) - end - - LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'" - return result + result end end -- cgit v1.2.3 From f2933ba1b51ba5dc28bc649d0919d6d7dce14721 Mon Sep 17 00:00:00 2001 From: rautenberg Date: Tue, 6 Mar 2012 16:10:20 +0100 Subject: subjectid from helper --- validation/validation_application.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 279cd14..32ca971 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -51,7 +51,7 @@ post '/crossvalidation/?' do cv_params = { :dataset_uri => params[:dataset_uri], :algorithm_uri => params[:algorithm_uri], :loo => "false", - :subjectid => params[:subjectid] } + :subjectid => @subjectid } [ :num_folds, :random_seed ].each{ |sym| cv_params[sym] = params[sym] if params[sym] } cv_params[:stratified] = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified] cv = Validation::Crossvalidation.create cv_params -- cgit v1.2.3