From 91787edb3682900bc5a2feeca66e5142f387fcc6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 7 Oct 2016 10:25:58 +0200 Subject: unified interface for prediction algorithms --- lib/caret.rb | 152 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/classification.rb | 2 +- lib/crossvalidation.rb | 4 +- lib/dataset.rb | 2 - lib/feature.rb | 18 +++--- lib/import.rb | 3 +- lib/nanoparticle.rb | 50 ---------------- lib/physchem.rb | 6 +- lib/regression.rb | 2 +- test/nanoparticles.rb | 129 ++++++----------------------------------- 10 files changed, 186 insertions(+), 182 deletions(-) create mode 100644 lib/caret.rb diff --git a/lib/caret.rb b/lib/caret.rb new file mode 100644 index 0000000..b999b06 --- /dev/null +++ b/lib/caret.rb @@ -0,0 +1,152 @@ +module OpenTox + module Algorithm + + class Caret + # TODO classification + # model list: https://topepo.github.io/caret/modelList.html + + attr_accessor :descriptors, :neighbors, :method, :relevant_features, :data_frame, :feature_names, :weights, :query_features + + def initialize descriptors:, neighbors:, method:, relevant_features: + @descriptors = descriptors + @neighbors = neighbors + @method = method + @relevant_features = relevant_features + end + + def self.regression descriptors:, neighbors:, method:, relevant_features:nil + + caret = new(descriptors:descriptors, neighbors:neighbors, method:method, relevant_features:relevant_features) + # collect training data for R + if descriptors.is_a? Array + caret.fingerprint2R + elsif descriptors.is_a? Hash + caret.properties2R + else + bad_request_error "Descriptors should be a fingerprint (Array) or properties (Hash). Cannot handle '#{descriptors.class}'." + end + if caret.feature_names.empty? or caret.data_frame.flatten.uniq == ["NA"] + prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + else + prediction = caret.r_model_prediction + if prediction.nil? or prediction[:value].nil? + prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." + end + end + prediction + + end + + def fingerprint2R + + values = [] + features = {} + @weights = [] + descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort + + neighbors.each do |n| + activities = n["measurements"] + activities.each do |act| + values << act + @weights << n["similarity"] + descriptor_ids.each do |id| + features[id] ||= [] + features[id] << n["descriptors"].include?(id) + end + end if activities + end + + @feature_names = [] + @data_frame = [values] + + features.each do |k,v| + unless v.uniq.size == 1 + @data_frame << v.collect{|m| m ? "T" : "F"} + @feature_names << k + end + end + @query_features = @feature_names.collect{|f| descriptors.include?(f) ? "T" : "F"} + + end + + + def properties2R + + @weights = [] + @feature_names = [] + @query_features = [] + + # keep only descriptors with values + @relevant_features.keys.each_with_index do |f,i| + if @descriptors[f] + @feature_names << f + @query_features << @descriptors[f].median + else + neighbors.each do |n| + n["descriptors"].delete_at i + end + end + end + + measurements = neighbors.collect{|n| n["measurements"]}.flatten + # initialize data frame with 'NA' defaults + @data_frame = Array.new(@feature_names.size+1){Array.new(measurements.size,"NA") } + + i = 0 + # parse neighbor activities and descriptors + neighbors.each do |n| + activities = n["measurements"] + activities.each do |act| # multiple measurements are treated as separate instances + unless n["descriptors"].include?(nil) + data_frame[0][i] = act + @weights << n["similarity"] + n["descriptors"].each_with_index do |d,j| + @data_frame[j+1][i] = d + end + i += 1 + end + end if activities # ignore neighbors without measurements + end + + end + + def r_model_prediction + begin + R.assign "weights", @weights + r_data_frame = "data.frame(#{@data_frame.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", @feature_names + R.eval "names(data) <- append(c('activities'),features)" # + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" + rescue => e + $logger.debug "R caret model creation error for:" + $logger.debug JSON.pretty_generate(self.inspect) + return nil + end + begin + R.eval "query <- data.frame(rbind(c(#{@query_features.join ','})))" + R.eval "names(query) <- features" + R.eval "prediction <- predict(model,query)" + value = R.eval("prediction").to_f + rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f + r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f + prediction_interval = value-1.96*rmse, value+1.96*rmse + { + :value => value, + :rmse => rmse, + :r_squared => r_squared, + :prediction_interval => prediction_interval + } + rescue => e + $logger.debug "R caret prediction error for:" + $logger.debug self.inspect + return nil + end + end + + end + end +end + diff --git a/lib/classification.rb b/lib/classification.rb index 01ba878..6582e7d 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,7 +3,7 @@ module OpenTox class Classification - def self.weighted_majority_vote descriptors:nil, neighbors: + def self.weighted_majority_vote descriptors:nil, neighbors:, method:nil, relevant_features:nil sims = {} neighbors.each do |neighbor| sim = neighbor["similarity"] diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index d7a1f08..15d1031 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -16,10 +16,10 @@ module OpenTox folds: n ) cv.save # set created_at + nr_instances = 0 nr_unpredicted = 0 - #predictions = {} - training_dataset = Dataset.find model.training_dataset_id + training_dataset = model.training_dataset training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" diff --git a/lib/dataset.rb b/lib/dataset.rb index 2e21e5b..453fc35 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -165,11 +165,9 @@ module OpenTox feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - metadata["numeric"] = true numeric[i] = true feature = NumericFeature.find_or_create_by(metadata) else - metadata["nominal"] = true metadata["accept_values"] = values numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) diff --git a/lib/feature.rb b/lib/feature.rb index c6fb68a..0ca4d41 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -2,30 +2,28 @@ module OpenTox # Basic feature class class Feature - field :nominal, type: Boolean - field :numeric, type: Boolean field :measured, type: Boolean field :calculated, type: Boolean field :category, type: String field :unit, type: String field :conditions, type: Hash + + def nominal? + self.class == NominalFeature + end + + def numeric? + self.class == NumericFeature + end end # Feature for categorical variables class NominalFeature < Feature field :accept_values, type: Array - def initialize params - super params - nominal = true - end end # Feature for quantitative variables class NumericFeature < Feature - def initialize params - super params - numeric = true - end end # Feature for SMARTS fragments diff --git a/lib/import.rb b/lib/import.rb index 17894a9..8e57401 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -76,7 +76,7 @@ module OpenTox if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source) + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset end else @@ -98,6 +98,7 @@ module OpenTox :category => category, :conditions => effect["conditions"], :source => study["protocol"]["category"]["term"], + :measured => true, :warnings => warnings ) nanoparticle.parse_ambit_value feature, effect["result"], dataset diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 6905f6f..f74f263 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -8,54 +8,6 @@ module OpenTox attr_accessor :scaled_values -=begin - def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:, relevant_features: - dataset = Dataset.find(dataset_id) - #relevant_features = {} - measurements = [] - substances = [] - # TODO: exclude query activities!!! - dataset.substances.each do |s| - if s.core == self.core # exclude nanoparticles with different core - dataset.values(s,prediction_feature_id).each do |act| - measurements << act - substances << s - end - end - end - neighbors = [] - substances.each do |substance| - values = dataset.values(substance,prediction_feature_id) - if values - common_descriptors = relevant_features.keys & substance.descriptors.keys - # scale values - query_descriptors = common_descriptors.collect{|d| (descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - @scaled_values = common_descriptors.collect{|d| [d,(descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h - neighbor_descriptors = common_descriptors.collect{|d| (substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h - #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]} - weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} - sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) - neighbors << { - "_id" => substance.id, - "measurements" => values, - "similarity" => sim, - "common_descriptors" => common_descriptors.collect do |id| - { - :id => id, - :scaled_value => neighbor_scaled_values[id], - :p_value => relevant_features[id]["p_value"], - :r_squared => relevant_features[id]["r"]**2} - end - } if sim >= min_sim - end - end - $logger.debug "#{self.name}: #{neighbors.size} neighbors" - neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} - neighbors - end -=end - def add_feature feature, value, dataset unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand case feature.category @@ -78,8 +30,6 @@ module OpenTox end def parse_ambit_value feature, v, dataset - #p dataset - #p feature # TODO add study id to warnings v.delete "unit" # TODO: ppm instead of weights diff --git a/lib/physchem.rb b/lib/physchem.rb index 86300ba..c32e382 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -42,7 +42,7 @@ module OpenTox def self.descriptors desc=DESCRIPTORS desc.collect do |name,description| lib,desc = name.split('.',2) - self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end end @@ -54,11 +54,11 @@ module OpenTox CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n| dname = "#{name}.#{n}" description = DESCRIPTORS[dname] - udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end else description = DESCRIPTORS[name] - udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end end udesc diff --git a/lib/regression.rb b/lib/regression.rb index cf6d9cb..0e5e06b 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,7 @@ module OpenTox class Regression - def self.weighted_average descriptors:nil, neighbors:, parameters:nil + def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 9b2d2d9..074a429 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -14,57 +14,18 @@ class NanoparticleTest < MiniTest::Test end def test_create_model - model = Model::Lazar.create training_dataset: @training_dataset + model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature nanoparticle = @training_dataset.nanoparticles[-34] prediction = model.predict nanoparticle - p prediction refute_nil prediction[:value] assert_includes nanoparticle.dataset_ids, @training_dataset.id + asser_true @prediction_feature.measured model.delete end - def test_inspect_cv - skip - cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last - #p cv - #p cv.id - #cv.correlation_plot_id = nil - File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} - #p cv.statistics - #p cv.model.@training_dataset.substances.first.physchem_descriptors.keys.collect{|d| Feature.find(d).name} - CrossValidation.all.sort_by{|cv| cv.created_at}.reverse.each do |cv| - p cv.name - p cv.created_at - begin - p cv.r_squared - rescue - end - end - end - def test_inspect_worst_prediction - skip - - cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last - worst_predictions = cv.worst_predictions(n: 3,show_neigbors: false) - assert_equal 3, worst_predictions.size - assert_kind_of Integer, worst_predictions.first[:neighbors] - worst_predictions = cv.worst_predictions - assert_equal 5, worst_predictions.size - assert_kind_of Array, worst_predictions.first[:neighbors] - assert_kind_of Integer, worst_predictions.first[:neighbors].first[:common_descriptors] - puts worst_predictions.to_yaml - worst_predictions = cv.worst_predictions(n: 2, show_common_descriptors: true) - #puts worst_predictions.to_yaml - assert_equal 2, worst_predictions.size - assert_kind_of Array, worst_predictions.first[:neighbors] - refute_nil worst_predictions.first[:neighbors].first[:common_descriptors] - #p cv.model.training_dataset.features - end - - def test_validate_model - algorithms = { :prediction => {:method => "Algorithm::Regression.weighted_average" } } - model = Model::Lazar.create training_dataset: @training_dataset - cv = RegressionCrossValidation.create model + def test_validate_default_nanoparticle_model + model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature + cv = CrossValidation.create model p cv.rmse p cv.r_squared #File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} @@ -72,62 +33,42 @@ class NanoparticleTest < MiniTest::Test refute_nil cv.rmse end - def test_validate_pls_model + def test_validate_pls_nanoparticle_model algorithms = { - :descriptors => { - :method => "properties", - :types => ["P-CHEM"] - }, - :prediction => {:method => "Algorithm::Caret.regression", :parameters => 'pls' }, + :descriptors => { :types => ["P-CHEM"] }, + :prediction => {:parameters => 'pls' }, } model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - cv = RegressionCrossValidation.create model + assert_equal "pls", model.algorithms[:prediction][:method] + cv = CrossValidation.create model p cv.rmse p cv.r_squared refute_nil cv.r_squared refute_nil cv.rmse end - def test_validate_random_forest_model + def test_validate_proteomics_pls_nanoparticle_model algorithms = { - :descriptors => { - :method => "properties", - :types => ["P-CHEM"] - }, - :prediction => {:method => "Algorithm::Caret.regression", :parameters => 'rf' } + :descriptors => { :types => ["Proteomics"] }, + :prediction => { :parameters => 'pls' } } model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - cv = RegressionCrossValidation.create model + assert_equal "pls", model.algorithms[:prediction][:method] + cv = CrossValidation.create model p cv.rmse p cv.r_squared refute_nil cv.r_squared refute_nil cv.rmse end - def test_validate_proteomics_pls_model - algorithms = { - :descriptors => { - :method => "properties", - :types => ["Proteomics"] - }, - :prediction => {:method => "Algorithm::Caret.regression", :parameters => 'rf' } - } - model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - cv = RegressionCrossValidation.create model - p cv.rmse - p cv.r_squared - refute_nil cv.r_squared - refute_nil cv.rmse - end - - def test_validate_all_default_model + def test_validate_all_default_nanoparticle_model algorithms = { :descriptors => { :types => ["Proteomics","P-CHEM"] }, } model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - cv = RegressionCrossValidation.create model + cv = CrossValidation.create model p cv.rmse p cv.r_squared refute_nil cv.r_squared @@ -141,42 +82,6 @@ class NanoparticleTest < MiniTest::Test end end - def test_summaries - skip - datasets = Dataset.all - datasets = datasets.select{|d| !d.name.nil?} - datasets.each do |d| - - #p d.features.select{|f| f.name.match (/Total/)} - #p d.features.collect{|f| "#{f.name} #{f.unit} #{f.conditions}"} - p d.features.uniq.collect{|f| f.name} - end - assert_equal 9, datasets.size -=begin - features = Feature.all.to_a - #p features.collect do |f| - #f if f.category == "TOX" - #end.to_a.flatten.size - toxcounts = {} - pccounts = {} - Nanoparticle.all.each do |np| - np.measurements.each do |t,v| - toxcounts[t] ||= 0 - toxcounts[t] += 1#v.uniq.size - end - np.physchem_descriptors.each do |t,v| - pccounts[t] ||= 0 - pccounts[t] += 1#v.uniq.size - end - end - #puts counts.keys.collect{|i| Feature.find(i)}.to_yaml - #pccounts.each{|e,n| p Feature.find(e),n if n > 100} - #p toxcounts.collect{|e,n| Feature.find(e).name if n > 1}.uniq - toxcounts.each{|e,n| p Feature.find(e),n if n > 100} -=end - end - - def test_import_ld skip dataset_ids = Import::Enanomapper.import_ld -- cgit v1.2.3