From 4348eec89033e6677c9f628646fc67bd03c73fe6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 6 Oct 2016 19:14:10 +0200 Subject: nano caret regression fixed --- lib/lazar.rb | 1 + lib/model.rb | 64 ++++++------- lib/regression.rb | 220 ------------------------------------------- lib/train-test-validation.rb | 5 +- test/all.rb | 2 +- test/model.rb | 31 ++---- test/nanoparticles.rb | 81 +++++++++------- test/validation.rb | 61 ++++++------ 8 files changed, 110 insertions(+), 355 deletions(-) diff --git a/lib/lazar.rb b/lib/lazar.rb index d0f05c0..f251379 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -83,6 +83,7 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation"," "model.rb", "classification.rb", "regression.rb", + "caret.rb", "validation-statistics.rb", "validation.rb", "train-test-validation.rb", diff --git a/lib/model.rb b/lib/model.rb index a272580..290309a 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -23,10 +23,12 @@ module OpenTox # explicit prediction algorithm if algorithms[:prediction] and algorithms[:prediction][:method] case algorithms[:prediction][:method] - when /Classifiction/ + when /Classification/i model = LazarClassification.new - when /Regression/ + when /Regression/i model = LazarRegression.new + else + bad_request_error "Prediction method '#{algorithms[:prediction][:method]}' not implemented." end # guess model type @@ -36,6 +38,10 @@ module OpenTox model = LazarClassification.new end + model.prediction_feature_id = prediction_feature.id + model.training_dataset_id = training_dataset.id + model.name = "#{training_dataset.name} #{prediction_feature.name}" + # set defaults substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 @@ -60,7 +66,7 @@ module OpenTox } elsif model.class == LazarRegression model.algorithms[:prediction] = { - :method => "Algorithm::Regression.caret", + :method => "Algorithm::Caret.regression", :parameters => "pls", } end @@ -77,7 +83,7 @@ module OpenTox :min => 0.5 }, :prediction => { - :method => "Algorithm::Regression.caret", + :method => "Algorithm::Caret.regression", :parameters => "rf", }, :feature_selection => { @@ -100,10 +106,6 @@ module OpenTox end end - model.prediction_feature_id = prediction_feature.id - model.training_dataset_id = training_dataset.id - model.name = "#{training_dataset.name} #{prediction_feature.name}" - if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] end @@ -151,8 +153,12 @@ module OpenTox else bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." end - params = algorithms[:prediction].merge({:descriptors => descriptors, :neighbors => neighbors}) - params.delete :method + params = { + :method => algorithms[:prediction][:parameters], + :descriptors => descriptors, + :neighbors => neighbors, + :relevant_features => relevant_features + } result = Algorithm.run algorithms[:prediction][:method], params prediction.merge! result prediction[:neighbors] = neighbors @@ -218,11 +224,9 @@ module OpenTox end class LazarClassification < Lazar - end class LazarRegression < Lazar - end class Prediction @@ -240,7 +244,7 @@ module OpenTox field :leave_one_out_validation_id, type: BSON::ObjectId def predict object - Lazar.find(model_id).predict object + model.predict object end def training_dataset @@ -251,6 +255,10 @@ module OpenTox Lazar.find model_id end + def prediction_feature + model.prediction_feature + end + def repeated_crossvalidation Validation::RepeatedCrossValidation.find repeated_crossvalidation_id end @@ -276,15 +284,8 @@ module OpenTox bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file prediction_model = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file - prediction_feature = training_dataset.features.first - model = nil - if prediction_feature.nominal? - model = LazarClassification.create prediction_feature, training_dataset - elsif prediction_feature.numeric? - model = LazarRegression.create prediction_feature, training_dataset - end + model = Lazar.create training_dataset: training_dataset prediction_model[:model_id] = model.id - prediction_model[:prediction_feature_id] = prediction_feature.id prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save @@ -297,26 +298,19 @@ module OpenTox def self.from_json_dump dir, category Import::Enanomapper.import dir - + training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset + Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") + training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + end prediction_model = self.new( :endpoint => "log2(Net cell association)", :source => "https://data.enanomapper.net/", :species => "A549 human lung epithelial carcinoma cells", :unit => "log2(ug/Mg)" ) - params = { - :feature_selection_algorithm => :correlation_filter, - :feature_selection_algorithm_parameters => {:category => category}, - :neighbor_algorithm => "physchem_neighbors", - :neighbor_algorithm_parameters => {:min_sim => 0.5}, - :prediction_algorithm => "OpenTox::Algorithm::Regression.physchem_regression", - :prediction_algorithm_parameters => {:method => 'rf'}, # random forests - } - training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - prediction_feature = Feature.find_or_create_by(name: "log2(Net cell association)", category: "TOX") - #prediction_feature = Feature.find("579621b84de73e267b414e55") - prediction_model[:prediction_feature_id] = prediction_feature.id - model = Model::LazarRegression.create(prediction_feature, training_dataset, params) + prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first + model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset) prediction_model[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id diff --git a/lib/regression.rb b/lib/regression.rb index 396c9e4..cf6d9cb 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -19,226 +19,6 @@ module OpenTox {:value => prediction} end - def self.caret descriptors:, neighbors:, method: "pls", parameters:nil - values = [] - descriptors = {} - weights = [] - descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort - - neighbors.each do |n| - activities = n["measurements"] - activities.each do |act| - values << act - weights << n["similarity"] - descriptor_ids.each do |id| - descriptors[id] ||= [] - descriptors[id] << n["descriptors"].include?(id) - end - end if activities - end - - variables = [] - data_frame = [values] - - descriptors.each do |k,v| - unless v.uniq.size == 1 - data_frame << v.collect{|m| m ? "T" : "F"} - variables << k - end - end - - if variables.empty? - prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." - prediction - else - substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"} - #puts data_frame.to_yaml - prediction = r_model_prediction method, data_frame, variables, weights, substance_features - if prediction.nil? or prediction[:value].nil? - prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) - prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." - prediction - else - prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] - prediction[:value] = prediction[:value] - prediction[:rmse] = prediction[:rmse] - prediction - end - end - - end - - def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" - values = [] - fingerprints = {} - weights = [] - fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort - - neighbors.each do |n| - fingerprint = Substance.find(n["_id"]).fingerprint - activities = n["measurements"] - activities.each do |act| - values << act - weights << n["similarity"] - fingerprint_ids.each do |id| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) - end - end if activities - end - - variables = [] - data_frame = [values] - - fingerprints.each do |k,v| - unless v.uniq.size == 1 - data_frame << v.collect{|m| m ? "T" : "F"} - variables << k - end - end - - if variables.empty? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." - prediction - else - substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} - prediction = r_model_prediction method, data_frame, variables, weights, substance_features - if prediction.nil? or prediction[:value].nil? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." - prediction - else - prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] - prediction[:value] = prediction[:value] - prediction[:rmse] = prediction[:rmse] - prediction - end - end - - end - -=begin - def self.physchem_regression substance:, neighbors:, method: "pls" - - activities = [] - weights = [] - pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort - data_frame = [] - data_frame[0] = [] - - neighbors.each_with_index do |n,i| - activities = n["measurements"] - activities.each do |act| - data_frame[0][i] = act - weights << n["similarity"] - n["common_descriptors"].each do |d| - j = pc_ids.index(d[:id])+1 - data_frame[j] ||= [] - data_frame[j][i] = d[:scaled_value] - end - end if activities - (0..pc_ids.size).each do |j| # for R: fill empty values with NA - data_frame[j] ||= [] - data_frame[j][i] ||= "NA" - end - end - - data_frame = data_frame.each_with_index.collect do |r,i| - if r.uniq.size == 1 # remove properties with a single value - r = nil - pc_ids[i-1] = nil # data_frame frame has additional activity entry - end - r - end - data_frame.compact! - pc_ids.compact! - - if pc_ids.empty? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." - prediction - else - query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } - query_descriptors = query_descriptors.each_with_index.collect do |v,i| - unless v - v = nil - data_frame[i] = nil - pc_ids[i] = nil - end - v - end - query_descriptors.compact! - data_frame.compact! - pc_ids.compact! - prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors - if prediction.nil? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." - end - p prediction - prediction - end - - end -=end - - def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values - R.assign "weights", training_weights - r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" -=begin -rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) - File.open("tmp.R","w+"){|f| - f.puts "suppressPackageStartupMessages({ - library(iterators,lib=\"#{rlib}\") - library(foreach,lib=\"#{rlib}\") - library(ggplot2,lib=\"#{rlib}\") - library(grid,lib=\"#{rlib}\") - library(gridExtra,lib=\"#{rlib}\") - library(pls,lib=\"#{rlib}\") - library(caret,lib=\"#{rlib}\") - library(doMC,lib=\"#{rlib}\") - registerDoMC(#{NR_CORES}) -})" - - f.puts "data <- #{r_data_frame}\n" - f.puts "weights <- c(#{training_weights.join(', ')})" - f.puts "features <- c(#{training_features.join(', ')})" - f.puts "names(data) <- append(c('activities'),features)" # - f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)" - f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)" - - f.puts "model <- train(activities ~ ., data = data, method = '#{method}')" - f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - f.puts "names(fingerprint) <- features" - f.puts "prediction <- predict(model,fingerprint)" - } -=end - - R.eval "data <- #{r_data_frame}" - R.assign "features", training_features - begin - R.eval "names(data) <- append(c('activities'),features)" # - R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" - R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - R.eval "names(fingerprint) <- features" - R.eval "prediction <- predict(model,fingerprint)" - value = R.eval("prediction").to_f - rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f - r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f - prediction_interval = value-1.96*rmse, value+1.96*rmse - { - :value => value, - :rmse => rmse, - :r_squared => r_squared, - :prediction_interval => prediction_interval - } - rescue - return nil - end - end - end end end diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb index 286614a..e3f5905 100644 --- a/lib/train-test-validation.rb +++ b/lib/train-test-validation.rb @@ -9,10 +9,7 @@ module OpenTox def self.create model, training_set, test_set - atts = model.attributes.dup # do not modify attributes of the original model - atts["_id"] = BSON::ObjectId.new - atts[:training_dataset_id] = training_set.id - validation_model = model.class.create model.prediction_feature, training_set, atts + validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms validation_model.save predictions = validation_model.predict test_set.substances nr_unpredicted = 0 diff --git a/test/all.rb b/test/all.rb index a10bcaa..eddf4e6 100644 --- a/test/all.rb +++ b/test/all.rb @@ -1,5 +1,5 @@ # "./default_environment.rb" has to be executed separately -exclude = ["./setup.rb","./all.rb", "./default_environment.rb","./nanoparticles.rb"] +exclude = ["./setup.rb","./all.rb", "./default_environment.rb"] (Dir[File.join(File.dirname(__FILE__),"*.rb")]-exclude).each do |test| require_relative test end diff --git a/test/model.rb b/test/model.rb index 563d081..02b8e73 100644 --- a/test/model.rb +++ b/test/model.rb @@ -13,7 +13,7 @@ class ModelTest < MiniTest::Test :min => 0.1 }, :prediction => { - :method => "Algorithm::Regression.caret", + :method => "Algorithm::Caret.regression", :parameters => "pls", }, :feature_selection => nil, @@ -65,7 +65,7 @@ class ModelTest < MiniTest::Test training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms assert_kind_of Model::LazarRegression, model - assert_equal "Algorithm::Regression.caret", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Caret.regression", model.algorithms[:prediction][:method] assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method] assert_equal 0.1, model.algorithms[:similarity][:min] assert_equal algorithms[:descriptors], model.algorithms[:descriptors] @@ -78,7 +78,7 @@ class ModelTest < MiniTest::Test training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first end model = Model::Lazar.create training_dataset: training_dataset - assert_equal "Algorithm::Regression.caret", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Caret.regression", model.algorithms[:prediction][:method] assert_equal "rf", model.algorithms[:prediction][:parameters] assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method] prediction = model.predict training_dataset.substances[14] @@ -87,6 +87,7 @@ class ModelTest < MiniTest::Test end def test_nanoparticle_parameters + skip end def test_regression_with_feature_selection @@ -98,13 +99,14 @@ class ModelTest < MiniTest::Test training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms assert_kind_of Model::LazarRegression, model - assert_equal "Algorithm::Regression.caret", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Caret.regression", model.algorithms[:prediction][:method] assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] assert_equal 0.1, model.algorithms[:similarity][:min] assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method] end def test_caret_parameters + skip end def test_default_classification @@ -153,25 +155,4 @@ class ModelTest < MiniTest::Test assert_equal 4, prediction[:neighbors].size end -=begin - def test_physchem_description - assert_equal 355, PhysChem.descriptors.size - assert_equal 15, PhysChem.openbabel_descriptors.size - assert_equal 295, PhysChem.cdk_descriptors.size - assert_equal 45, PhysChem.joelib_descriptors.size - assert_equal 310, PhysChem.unique_descriptors.size - end - - def test_physchem - assert_equal 355, PhysChem.descriptors.size - c = Compound.from_smiles "CC(=O)CC(C)C" - logP = PhysChem.find_or_create_by :name => "Openbabel.logP" - assert_equal 1.6215, logP.calculate(c) - jlogP = PhysChem.find_or_create_by :name => "Joelib.LogP" - assert_equal 3.5951, jlogP.calculate(c) - alogP = PhysChem.find_or_create_by :name => "Cdk.ALOGP.ALogP" - assert_equal 0.35380000000000034, alogP.calculate(c) - end -=end - end diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 23c09e7..9b2d2d9 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -5,29 +5,26 @@ class NanoparticleTest < MiniTest::Test include OpenTox::Validation def setup - # TODO: multiple runs create duplicates - #$mongo.database.drop - #Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first unless @training_dataset Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") @training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first end + @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first end def test_create_model - skip - @training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") - model = Model::LazarRegression.create(feature, @training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors"}) + model = Model::Lazar.create training_dataset: @training_dataset nanoparticle = @training_dataset.nanoparticles[-34] prediction = model.predict nanoparticle + p prediction refute_nil prediction[:value] assert_includes nanoparticle.dataset_ids, @training_dataset.id model.delete end def test_inspect_cv + skip cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last #p cv #p cv.id @@ -45,6 +42,7 @@ class NanoparticleTest < MiniTest::Test end end def test_inspect_worst_prediction + skip cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last worst_predictions = cv.worst_predictions(n: 3,show_neigbors: false) @@ -64,10 +62,8 @@ class NanoparticleTest < MiniTest::Test end def test_validate_model - #feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") - feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX") - - model = Model::LazarRegression.create(feature, @training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :feature_selection_algorithm => :correlation_filter, :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}}) + algorithms = { :prediction => {:method => "Algorithm::Regression.weighted_average" } } + model = Model::Lazar.create training_dataset: @training_dataset cv = RegressionCrossValidation.create model p cv.rmse p cv.r_squared @@ -77,17 +73,14 @@ class NanoparticleTest < MiniTest::Test end def test_validate_pls_model - feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX") - - model = Model::LazarRegression.create(feature, @training_dataset, { - :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", - :feature_selection_algorithm => :correlation_filter, - :prediction_algorithm_parameters => {:method => 'pls'}, - #:feature_selection_algorithm_parameters => {:category => "P-CHEM"}, - #:feature_selection_algorithm_parameters => {:category => "Proteomics"}, - :neighbor_algorithm => "physchem_neighbors", - :neighbor_algorithm_parameters => {:min_sim => 0.5} - }) + algorithms = { + :descriptors => { + :method => "properties", + :types => ["P-CHEM"] + }, + :prediction => {:method => "Algorithm::Caret.regression", :parameters => 'pls' }, + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms cv = RegressionCrossValidation.create model p cv.rmse p cv.r_squared @@ -96,17 +89,14 @@ class NanoparticleTest < MiniTest::Test end def test_validate_random_forest_model - feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX") - - model = Model::LazarRegression.create(feature, @training_dataset, { - :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", - :prediction_algorithm_parameters => {:method => 'rf'}, - :feature_selection_algorithm => :correlation_filter, - #:feature_selection_algorithm_parameters => {:category => "P-CHEM"}, - #:feature_selection_algorithm_parameters => {:category => "Proteomics"}, - :neighbor_algorithm => "physchem_neighbors", - :neighbor_algorithm_parameters => {:min_sim => 0.5} - }) + algorithms = { + :descriptors => { + :method => "properties", + :types => ["P-CHEM"] + }, + :prediction => {:method => "Algorithm::Caret.regression", :parameters => 'rf' } + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms cv = RegressionCrossValidation.create model p cv.rmse p cv.r_squared @@ -115,9 +105,28 @@ class NanoparticleTest < MiniTest::Test end def test_validate_proteomics_pls_model - feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX") - - model = Model::LazarRegression.create(feature, @training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "proteomics_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}}) + algorithms = { + :descriptors => { + :method => "properties", + :types => ["Proteomics"] + }, + :prediction => {:method => "Algorithm::Caret.regression", :parameters => 'rf' } + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms + cv = RegressionCrossValidation.create model + p cv.rmse + p cv.r_squared + refute_nil cv.r_squared + refute_nil cv.rmse + end + + def test_validate_all_default_model + algorithms = { + :descriptors => { + :types => ["Proteomics","P-CHEM"] + }, + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms cv = RegressionCrossValidation.create model p cv.rmse p cv.r_squared diff --git a/test/validation.rb b/test/validation.rb index b4f5a92..03adf69 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -7,7 +7,7 @@ class ValidationTest < MiniTest::Test def test_default_classification_crossvalidation dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - model = Model::LazarClassification.create dataset.features.first, dataset + model = Model::Lazar.create training_dataset: dataset cv = ClassificationCrossValidation.create model assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split" assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than accuracy (#{cv.accuracy})." @@ -15,9 +15,9 @@ class ValidationTest < MiniTest::Test def test_default_regression_crossvalidation dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" - model = Model::LazarRegression.create dataset.features.first, dataset + model = Model::Lazar.create training_dataset: dataset cv = RegressionCrossValidation.create model - assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be larger than 1.5, this may occur due to an unfavorable training/test set split" + assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be smaller than 1.5, this may occur due to an unfavorable training/test set split" assert cv.mae < 1, "MAE #{cv.mae} should be smaller than 1, this may occur due to an unfavorable training/test set split" end @@ -25,23 +25,20 @@ class ValidationTest < MiniTest::Test def test_classification_crossvalidation_parameters dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - params = { - :neighbor_algorithm_parameters => { - :min_sim => 0.3, - :type => "FP3" - } + algorithms = { + :similarity => { :min => 0.3, }, + :descriptors => { :type => "FP3" } } - model = Model::LazarClassification.create dataset.features.first, dataset, params - model.save + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms cv = ClassificationCrossValidation.create model - params = model.neighbor_algorithm_parameters + params = model.algorithms params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string - + cv.validations.each do |validation| - validation_params = validation.model.neighbor_algorithm_parameters - refute_nil params["dataset_id"] - refute_nil validation_params[:dataset_id] - refute_equal params["dataset_id"], validation_params[:dataset_id] + validation_params = validation.model.algorithms + refute_nil model.training_dataset_id + refute_nil validation.model.training_dataset_id + refute_equal model.training_dataset_id, validation.model.training_dataset_id ["min_sim","type","prediction_feature_id"].each do |k| assert_equal params[k], validation_params[k] end @@ -50,24 +47,20 @@ class ValidationTest < MiniTest::Test def test_regression_crossvalidation_params dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" - params = { - :prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", - :neighbor_algorithm => "fingerprint_neighbors", - :neighbor_algorithm_parameters => { - :type => "MACCS", - :min_sim => 0.7, - } + algorithms = { + :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" }, + :descriptors => { :type => "MACCS", }, + :similarity => {:min => 0.7} } - model = Model::LazarRegression.create dataset.features.first, dataset, params - assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type] + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms + assert_equal algorithms[:descriptors][:type], model.algorithms[:descriptors][:type] cv = RegressionCrossValidation.create model cv.validation_ids.each do |vid| model = Model::Lazar.find(Validation.find(vid).model_id) - assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type] - assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim] - refute_nil model[:neighbor_algorithm_parameters][:dataset_id] - refute_equal dataset.id, model[:neighbor_algorithm_parameters][:dataset_id] - assert_equal model.training_dataset_id, model[:neighbor_algorithm_parameters][:dataset_id] + assert_equal algorithms[:descriptors][:type], model.algorithms[:descriptors][:type] + assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] + refute_nil model.training_dataset_id + refute_equal dataset.id, model.training_dataset_id end refute_nil cv.rmse @@ -77,7 +70,7 @@ class ValidationTest < MiniTest::Test def test_physchem_regression_crossvalidation skip # TODO: fix training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") - model = Model::LazarRegression.create(training_dataset.features.first, training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") + model = Model::Lazar.create(training_dataset.features.first, training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") cv = RegressionCrossValidation.create model refute_nil cv.rmse refute_nil cv.mae @@ -87,7 +80,7 @@ class ValidationTest < MiniTest::Test def test_classification_loo_validation dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - model = Model::LazarClassification.create dataset.features.first, dataset + model = Model::Lazar.create training_dataset: dataset loo = ClassificationLeaveOneOut.create model assert_equal 14, loo.nr_unpredicted refute_empty loo.confusion_matrix @@ -97,7 +90,7 @@ class ValidationTest < MiniTest::Test def test_regression_loo_validation dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") - model = Model::LazarRegression.create dataset.features.first, dataset + model = Model::Lazar.create training_dataset: dataset loo = RegressionLeaveOneOut.create model assert loo.r_squared > 0.34, "R^2 (#{loo.r_squared}) should be larger than 0.034" end @@ -106,7 +99,7 @@ class ValidationTest < MiniTest::Test def test_repeated_crossvalidation dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - model = Model::LazarClassification.create dataset.features.first, dataset + model = Model::Lazar.create training_dataset: dataset repeated_cv = RepeatedCrossValidation.create model repeated_cv.crossvalidations.each do |cv| assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split" -- cgit v1.2.3