summary refs log tree commit diff
diff options
context:
space:
mode:
author gebele <gebele@in-silico.ch> 2017-05-26 12:53:01 +0000
committer gebele <gebele@in-silico.ch> 2017-05-26 12:53:01 +0000
commit 6ed197736516d98e200cc64d922f42eb3122589c (patch)
tree d2c7de76b020be254cc82563d36e6711fd6f1867
parent 61a7d994d8f4fbcf25414beea96189bf885ad19d (diff)
parent 9aa5203dd375225996c1efe4be1a4324ddc6cda7 (diff)
Merge branch 'development'
-rw-r--r-- lib/caret.rb 11
-rw-r--r-- lib/classification.rb 5
-rw-r--r-- lib/crossvalidation.rb 1
-rw-r--r-- lib/dataset.rb 2
-rw-r--r-- lib/leave-one-out-validation.rb 1
-rw-r--r-- lib/model.rb 48
-rw-r--r-- lib/regression.rb 2
-rw-r--r-- lib/train-test-validation.rb 2
-rw-r--r-- lib/unique_descriptors.rb 3
-rw-r--r-- lib/validation-statistics.rb 8
-rw-r--r-- test/dataset.rb 2
-rw-r--r-- test/feature.rb 10
-rw-r--r-- test/model-classification.rb 8
-rw-r--r-- test/model-regression.rb 28
-rw-r--r-- test/model-validation.rb 2
-rw-r--r-- test/setup.rb 2
-rw-r--r-- test/validation-classification.rb 2
-rw-r--r-- test/validation-regression.rb 2
18 files changed, 84 insertions, 55 deletions
diff --git a/lib/caret.rb b/lib/caret.rb
index f5c2bde..8bccf74 100644
--- a/lib/caret.rb
+++ b/lib/caret.rb
@@ -22,12 +22,11 @@ module OpenTox
end
if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+ prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances."
elsif
dependent_variables.size < 3
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
-
+ prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
else
dependent_variables.each_with_index do |v,i|
dependent_variables[i] = to_r(v)
@@ -52,7 +51,7 @@ module OpenTox
$logger.debug dependent_variables
$logger.debug independent_variables
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warning] = "R caret model creation error. Using weighted average of similar substances."
+ prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances."
return prediction
end
begin
@@ -73,12 +72,12 @@ module OpenTox
$logger.debug "R caret prediction error for:"
$logger.debug self.inspect
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warning] = "R caret prediction error. Using weighted average of similar substances"
+ prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances"
return prediction
end
if prediction.nil? or prediction[:value].nil?
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
+ prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances."
end
end
prediction
diff --git a/lib/classification.rb b/lib/classification.rb
index 638492b..a875903 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -18,6 +18,11 @@ module OpenTox
class_weights.each do |a,w|
probabilities[a] = w.sum/weights.sum
end
+ # DG: hack to ensure always two probability values
+ if probabilities.keys.uniq.size == 1
+ missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0]
+ probabilities[missing_key] = 0.0
+ end
probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
p_max = probabilities.collect{|a,p| p}.max
prediction = probabilities.key(p_max)
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 75c5db5..06a1e2a 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -90,6 +90,7 @@ module OpenTox
field :within_prediction_interval, type: Integer, default:0
field :out_of_prediction_interval, type: Integer, default:0
field :correlation_plot_id, type: BSON::ObjectId
+ field :warnings, type: Array
end
# Independent repeated crossvalidations
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 44690e1..6e7d67f 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -46,7 +46,7 @@ module OpenTox
if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
data_entries[substance.to_s][feature.to_s]
else
- nil
+ [nil]
end
end
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 8d22018..c33c92b 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -58,6 +58,7 @@ module OpenTox
field :within_prediction_interval, type: Integer, default:0
field :out_of_prediction_interval, type: Integer, default:0
field :correlation_plot_id, type: BSON::ObjectId
+ field :warnings, type: Array
end
end
diff --git a/lib/model.rb b/lib/model.rb
index b18610d..475a346 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -57,7 +57,7 @@ module OpenTox
model.version = {:warning => "git is not installed"}
end
- # set defaults
+ # set defaults#
substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
@@ -68,10 +68,6 @@ module OpenTox
:method => "fingerprint",
:type => "MP2D",
},
- :similarity => {
- :method => "Algorithm::Similarity.tanimoto",
- :min => 0.1
- },
:feature_selection => nil
}
@@ -79,9 +75,17 @@ module OpenTox
model.algorithms[:prediction] = {
:method => "Algorithm::Classification.weighted_majority_vote",
}
+ model.algorithms[:similarity] = {
+ :method => "Algorithm::Similarity.tanimoto",
+ :min => 0.1,
+ }
elsif model.class == LazarRegression
model.algorithms[:prediction] = {
- :method => "Algorithm::Caret.pls",
+ :method => "Algorithm::Caret.rf",
+ }
+ model.algorithms[:similarity] = {
+ :method => "Algorithm::Similarity.tanimoto",
+ :min => 0.5,
}
end
@@ -93,7 +97,7 @@ module OpenTox
},
:similarity => {
:method => "Algorithm::Similarity.weighted_cosine",
- :min => 0.5
+ :min => 0.5,
},
:prediction => {
:method => "Algorithm::Caret.rf",
@@ -141,7 +145,6 @@ module OpenTox
end
model.descriptor_ids = model.fingerprints.flatten.uniq
model.descriptor_ids.each do |d|
- # resulting model may break BSON size limit (e.g. f Kazius dataset)
model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
end
# calculate physchem properties
@@ -191,7 +194,7 @@ module OpenTox
# Predict a substance (compound or nanoparticle)
# @param [OpenTox::Substance]
# @return [Hash]
- def predict_substance substance
+ def predict_substance substance, threshold = self.algorithms[:similarity][:min]
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
case algorithms[:similarity][:method]
@@ -221,20 +224,19 @@ module OpenTox
bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
end
- prediction = {}
+ prediction = {:warnings => [], :measurements => []}
+ prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
neighbor_ids = []
neighbor_similarities = []
neighbor_dependent_variables = []
neighbor_independent_variables = []
- prediction = {}
# find neighbors
substance_ids.each_with_index do |s,i|
# handle query substance
if substance.id.to_s == s
- prediction[:measurements] ||= []
prediction[:measurements] << dependent_variables[i]
- prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
+ prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
else
if fingerprints?
neighbor_descriptors = fingerprints[i]
@@ -243,7 +245,7 @@ module OpenTox
neighbor_descriptors = scaled_variables.collect{|v| v[i]}
end
sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
- if sim >= algorithms[:similarity][:min]
+ if sim >= threshold
neighbor_ids << s
neighbor_similarities << sim
neighbor_dependent_variables << dependent_variables[i]
@@ -258,17 +260,27 @@ module OpenTox
measurements = nil
if neighbor_similarities.empty?
- prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
+ prediction[:value] = nil
+ prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
elsif neighbor_similarities.size == 1
- prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
+ prediction[:value] = nil
+ prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
+ prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
else
query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
# call prediction algorithm
result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
prediction.merge! result
prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
+ #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
+ #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
+ #end
+ end
+ if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
+ prediction
+ else # try again with a lower threshold
+ predict_substance substance, 0.2
end
- prediction
end
# Predict a substance (compound or nanoparticle), an array of substances or a dataset
@@ -300,7 +312,7 @@ module OpenTox
# serialize result
if object.is_a? Substance
prediction = predictions[substances.first.id.to_s]
- prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
+ prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity
return prediction
elsif object.is_a? Array
return predictions
diff --git a/lib/regression.rb b/lib/regression.rb
index fd2855f..25c0732 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -17,7 +17,7 @@ module OpenTox
sim_sum += weights[i]
end if dependent_variables
sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
- {:value => prediction}
+ {:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]}
end
end
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
index 034ae3a..9a5532d 100644
--- a/lib/train-test-validation.rb
+++ b/lib/train-test-validation.rb
@@ -27,6 +27,8 @@ module OpenTox
end
end
predictions.select!{|cid,p| p[:value] and p[:measurements]}
+ # hack to avoid mongos file size limit error on large datasets
+ #predictions.each{|cid,p| p[:neighbors] = []} if model.training_dataset.name.match(/mutagenicity/i)
validation = self.new(
:model_id => validation_model.id,
:test_dataset_id => test_set.id,
diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb
index 8341a67..fc10cd4 100644
--- a/lib/unique_descriptors.rb
+++ b/lib/unique_descriptors.rb
@@ -48,7 +48,8 @@ UNIQUEDESCRIPTORS = [
#"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
#"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
"Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
- "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
+ # TODO check why the next descriptor is not present in the CDK_DESCRIPTIONS variable.
+ #"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
"Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
"Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
"Cdk.LargestChain", #Returns the number of atoms in the largest chain
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 2d522ae..69e7992 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -111,6 +111,7 @@ module OpenTox
# Get statistics
# @return [Hash]
def statistics
+ self.warnings = []
self.rmse = 0
self.mae = 0
self.within_prediction_interval = 0
@@ -132,8 +133,10 @@ module OpenTox
end
end
else
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ trd_id = model.training_dataset_id
+ smiles = Compound.find(cid).smiles
+ self.warnings << "No training activities for #{smiles} in training dataset #{trd_id}."
+ $logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
end
end
R.assign "measurement", x
@@ -146,6 +149,7 @@ module OpenTox
$logger.debug "RMSE #{rmse}"
$logger.debug "MAE #{mae}"
$logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
+ $logger.debug "#{warnings}"
save
{
:mae => mae,
diff --git a/test/dataset.rb b/test/dataset.rb
index e91e65a..055a029 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -160,7 +160,7 @@ class DatasetTest < MiniTest::Test
if v.numeric?
assert_equal v.to_f, serialized[inchi][i].to_f
else
- assert_equal v, serialized[inchi][i]
+ assert_equal v.to_s, serialized[inchi][i].to_s
end
end
diff --git a/test/feature.rb b/test/feature.rb
index 40edb9f..85ce588 100644
--- a/test/feature.rb
+++ b/test/feature.rb
@@ -57,20 +57,20 @@ class FeatureTest < MiniTest::Test
def test_physchem_description
assert_equal 346, PhysChem.descriptors.size
assert_equal 15, PhysChem.openbabel_descriptors.size
- assert_equal 295, PhysChem.cdk_descriptors.size
+ assert_equal 286, PhysChem.cdk_descriptors.size
assert_equal 45, PhysChem.joelib_descriptors.size
- assert_equal 310, PhysChem.unique_descriptors.size
+ assert_equal 309, PhysChem.unique_descriptors.size
end
def test_physchem
assert_equal 346, PhysChem.descriptors.size
c = Compound.from_smiles "CC(=O)CC(C)C"
logP = PhysChem.find_or_create_by :name => "Openbabel.logP"
- assert_equal 1.6215, logP.calculate(c)
+ assert_equal 1.6215, c.calculate_properties([logP]).first
jlogP = PhysChem.find_or_create_by :name => "Joelib.LogP"
- assert_equal 3.5951, jlogP.calculate(c)
+ assert_equal 3.5951, c.calculate_properties([jlogP]).first
alogP = PhysChem.find_or_create_by :name => "Cdk.ALOGP.ALogP"
- assert_equal 0.35380000000000034, alogP.calculate(c)
+ assert_equal 0.35380000000000034, c.calculate_properties([alogP]).first
end
end
diff --git a/test/model-classification.rb b/test/model-classification.rb
index 1424f6a..f75598b 100644
--- a/test/model-classification.rb
+++ b/test/model-classification.rb
@@ -46,12 +46,14 @@ class LazarClassificationTest < MiniTest::Test
assert_equal compound_dataset.compounds, prediction_dataset.compounds
cid = prediction_dataset.compounds[7].id.to_s
- assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning]
+ assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0]
+ expectations = ["Cannot create prediction: Only one similar compound in the training set.",
+ "Could not find similar substances with experimental data in the training dataset."]
prediction_dataset.predictions.each do |cid,pred|
- assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil?
+ assert_includes expectations, pred[:warnings][0] if pred[:value].nil?
end
cid = Compound.from_smiles("CCOC(=O)N").id.to_s
- assert_match "excluded", prediction_dataset.predictions[cid][:warning]
+ assert_match "excluded", prediction_dataset.predictions[cid][:info]
# cleanup
[training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
end
diff --git a/test/model-regression.rb b/test/model-regression.rb
index 86b927c..5903e88 100644
--- a/test/model-regression.rb
+++ b/test/model-regression.rb
@@ -10,21 +10,21 @@ class LazarRegressionTest < MiniTest::Test
},
:similarity => {
:method => "Algorithm::Similarity.tanimoto",
- :min => 0.1
+ :min => 0.5
},
:prediction => {
- :method => "Algorithm::Caret.pls",
+ :method => "Algorithm::Caret.rf",
},
:feature_selection => nil,
}
- training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv")
model = Model::Lazar.create training_dataset: training_dataset
assert_kind_of Model::LazarRegression, model
assert_equal algorithms, model.algorithms
- substance = training_dataset.substances[10]
+ substance = training_dataset.substances[145]
prediction = model.predict substance
assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
- substance = Compound.from_smiles "NC(=O)OCCC"
+ substance = Compound.from_smiles "c1ccc(cc1)Oc1ccccc1"
prediction = model.predict substance
refute_nil prediction[:value]
refute_nil prediction[:prediction_interval]
@@ -59,8 +59,8 @@ class LazarRegressionTest < MiniTest::Test
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
compound = Compound.from_smiles "CCCSCCSCC"
prediction = model.predict compound
- assert_equal 4, prediction[:neighbors].size
- assert_equal 1.37, prediction[:value].round(2)
+ assert_equal 3, prediction[:neighbors].size
+ assert prediction[:value].round(2) > 1.37, "Prediction value (#{prediction[:value].round(2)}) should be larger than 1.37."
end
def test_local_physchem_regression
@@ -112,12 +112,12 @@ class LazarRegressionTest < MiniTest::Test
:method => "Algorithm::Similarity.cosine",
}
}
- training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
assert_kind_of Model::LazarRegression, model
- assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method]
- assert_equal 0.1, model.algorithms[:similarity][:min]
+ assert_equal 0.5, model.algorithms[:similarity][:min]
algorithms[:descriptors].delete :features
assert_equal algorithms[:descriptors], model.algorithms[:descriptors]
prediction = model.predict training_dataset.substances[10]
@@ -130,14 +130,14 @@ class LazarRegressionTest < MiniTest::Test
:method => "Algorithm::FeatureSelection.correlation_filter",
},
}
- training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv")
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
assert_kind_of Model::LazarRegression, model
- assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
- assert_equal 0.1, model.algorithms[:similarity][:min]
+ assert_equal 0.5, model.algorithms[:similarity][:min]
assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
- prediction = model.predict training_dataset.substances[10]
+ prediction = model.predict training_dataset.substances[145]
refute_nil prediction[:value]
end
diff --git a/test/model-validation.rb b/test/model-validation.rb
index 83986d6..9304232 100644
--- a/test/model-validation.rb
+++ b/test/model-validation.rb
@@ -12,7 +12,7 @@ class ValidationModelTest < MiniTest::Test
m.crossvalidations.each do |cv|
assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. This may happen due to an unfavorable training/test set split."
end
- prediction = m.predict Compound.from_smiles("CCCC(NN)C")
+ prediction = m.predict Compound.from_smiles("OCC(CN(CC(O)C)N=O)O")
assert_equal "true", prediction[:value]
m.delete
end
diff --git a/test/setup.rb b/test/setup.rb
index 40c8ebf..c1cddfb 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -3,6 +3,8 @@ require 'minitest/autorun'
require_relative '../lib/lazar.rb'
#require 'lazar'
include OpenTox
+#$mongo.database.drop
+#$gridfs = $mongo.database.fs # recreate GridFS indexes
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")
training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
diff --git a/test/validation-classification.rb b/test/validation-classification.rb
index fb4c3e7..ce06063 100644
--- a/test/validation-classification.rb
+++ b/test/validation-classification.rb
@@ -47,7 +47,7 @@ class ValidationClassificationTest < MiniTest::Test
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
model = Model::Lazar.create training_dataset: dataset
loo = ClassificationLeaveOneOut.create model
- assert_equal 14, loo.nr_unpredicted
+ assert_equal 24, loo.nr_unpredicted
refute_empty loo.confusion_matrix
assert loo.accuracy > 0.77
assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})."
diff --git a/test/validation-regression.rb b/test/validation-regression.rb
index 01ed644..c5ad312 100644
--- a/test/validation-regression.rb
+++ b/test/validation-regression.rb
@@ -84,7 +84,7 @@ class ValidationRegressionTest < MiniTest::Test
repeated_cv = RepeatedCrossValidation.create model
repeated_cv.crossvalidations.each do |cv|
assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
- assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
+ assert cv.rmse < 0.5, "RMSE (#{cv.rmse}) should be smaller than 0.5"
end
end