From ca2bb0f90335b1f2c4ecc28ee423e85b281ffcf0 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Wed, 4 Nov 2015 17:50:17 +0100
Subject: neighbor search delegated to database backend

---
 lib/classification.rb        |  6 ++---
 lib/compound.rb              | 52 ++++++++++++++++++++++++++------------------
 lib/crossvalidation.rb       |  4 ++--
 lib/dataset.rb               | 21 ++++++++++++++----
 lib/descriptor.rb            |  1 -
 lib/model.rb                 | 31 +++++++++++++-------------
 lib/regression.rb            | 37 ++++++++++---------------------
 test/compound.rb             | 14 ++++++++++--
 test/dataset-long.rb         |  1 +
 test/dataset.rb              |  6 ++---
 test/fminer-long.rb          |  3 +++
 test/lazar-classification.rb | 42 +++++++++++++++++++++++++++++++++++
 test/lazar-fminer.rb         |  1 +
 test/lazar-long.rb           | 23 +++++++++++++++++++-
 test/lazar-regression.rb     |  4 ++--
 test/prediction_models.rb    | 11 +---------
 test/validation.rb           | 26 +++++++++++++---------
 17 files changed, 181 insertions(+), 102 deletions(-)
 create mode 100644 test/lazar-classification.rb

diff --git a/lib/classification.rb b/lib/classification.rb
index b4b2e59..7a225bb 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -5,14 +5,12 @@ module OpenTox
 
     def self.weighted_majority_vote compound, params
       neighbors = params[:neighbors]
-      return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?
       weighted_sum = {}
       sim_sum = 0.0
       confidence = 0.0
       neighbors.each do |row|
-        n,sim,acts = row
-        #confidence = sim if sim > confidence # distance to nearest neighbor
-        acts.each do |act|
+        sim = row["tanimoto"]
+        row["features"][params[:prediction_feature_id].to_s].each do |act|
           weighted_sum[act] ||= 0
           weighted_sum[act] += sim
         end
diff --git a/lib/compound.rb b/lib/compound.rb
index a26528b..c5e7f02 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -23,13 +23,16 @@ module OpenTox
     field :sdf_id, type: BSON::ObjectId
     field :fingerprints, type: Hash, default: {}
     field :default_fingerprint_size, type: Integer
+    field :dataset_ids, type: Array, default: []
+    field :features, type: Hash, default: {}
 
     index({smiles: 1}, {unique: true})
+    #index({default_fingerprint: 1}, {unique: false})
 
     # Overwrites standard Mongoid method to create fingerprints before database insertion
     def self.find_or_create_by params
       compound = self.find_or_initialize_by params
-      compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT)
+      compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size
       compound.save
       compound
     end
@@ -41,7 +44,7 @@ module OpenTox
       if type == "MP2D"
        fp = obconversion(smiles,"smi","mpd").strip.split("\t")
        name = fp.shift # remove Title
-       fingerprints[type] = fp
+       fingerprints[type] = fp.uniq # no fingerprint counts
      #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
      elsif type== "MNA"
        level = 2 # TODO: level as parameter, evaluate level 1, see paper
@@ -244,20 +247,23 @@ module OpenTox
 
     def fingerprint_neighbors params
       bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
       neighbors = []
-      #if params[:type] == DEFAULT_FINGERPRINT
-        #neighbors = db_neighbors params
-        #p neighbors
-      #else
+      if params[:type] == DEFAULT_FINGERPRINT
+        neighbors = db_neighbors params
+      else
         query_fingerprint = self.fingerprint params[:type]
-        training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
-          unless self == compound
+        training_dataset = Dataset.find(params[:training_dataset_id])
+        prediction_feature = training_dataset.features.first
+        training_dataset.compounds.each do |compound|
+          #unless self == compound
             candidate_fingerprint = compound.fingerprint params[:type]
             sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
-            neighbors << [compound.id, sim] if sim >= params[:min_sim]
-          end
+            feature_values = training_dataset.values(compound,prediction_feature)
+            neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
+          #end
         end
-      #end
-      neighbors.sort{|a,b| b.last <=> a.last}
+        neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
+      end
+      neighbors
     end
 
     def fminer_neighbors params
@@ -299,30 +305,34 @@ module OpenTox
     end
 
     def db_neighbors params
-      p "DB NEIGHBORS"
-      p params
-      # TODO restrict to dataset
       # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
-      qn = fingerprint(params[:type]).size
+
+      #qn = default_fingerprint_size
       #qmin = qn * threshold
       #qmax = qn / threshold
       #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
       #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
       aggregate = [
        #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
-       {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
+       #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
        {'$project' => {
          'tanimoto' => {'$let' => {
-           'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}},
-           'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]}
+           'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
+           #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
+           'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
            }},
-         '_id' => 1
+         '_id' => 1,
+         'features' => 1,
+         'dataset_ids' => 1
          }},
        {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
        {'$sort' => {'tanimoto' => -1}}
      ]
 
-      $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
+      $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
+
+
+      #$mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
 
     end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 2e6dabb..3127351 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -136,7 +136,7 @@ module OpenTox
       incorrect_predictions = 0
       predictions.each do |p|
         if p[1] and p[2]
-          p[1] == p [2] ? correct_predictions += 1 : incorrect_predictions += 1
+          p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
           accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
           confidences << p[3]
@@ -243,7 +243,7 @@ module OpenTox
             :neighbors => neighbors
           }
         end
-      end.compact.sort{|a,b| p a; b[:relative_error] <=> a[:relative_error]}[0..n-1]
+      end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
     end
 
     def confidence_plot
diff --git a/lib/dataset.rb b/lib/dataset.rb
index d989bdf..af116a9 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -105,10 +105,18 @@ module OpenTox
         test_cids = test_idxs.collect{|i| self.compound_ids[i]}
         test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
         test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
+        test_dataset.compounds.each do |compound|
+          compound.dataset_ids << test_dataset.id
+          compound.save
+        end
         training_idxs = indices-test_idxs
         training_cids = training_idxs.collect{|i| self.compound_ids[i]}
         training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
         training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
+        training_dataset.compounds.each do |compound|
+          compound.dataset_ids << training_dataset.id
+          compound.save
+        end
         test_dataset.save_all
         training_dataset.save_all
         chunks << [training_dataset,test_dataset]
@@ -229,7 +237,7 @@ module OpenTox
       table.each_with_index do |vals,i|
         ct = Time.now
-        identifier = vals.shift
+        identifier = vals.shift.strip
         warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
         begin
           case compound_format
@@ -246,7 +254,7 @@ module OpenTox
           warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
           next
         end
-        # TODO insert empty compounds to keep positions?
+        compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
         compound_time += Time.now-ct
 
         r += 1
@@ -263,10 +271,15 @@ module OpenTox
             warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
             next
           elsif numeric[j]
-            self.data_entries.last[j] = v.to_f
+            v = v.to_f
           else
-            self.data_entries.last[j] = v.strip
+            v = v.strip
           end
+          self.data_entries.last[j] = v
+          #i = compound.feature_ids.index feature_ids[j]
+          compound.features[feature_ids[j].to_s] ||= []
+          compound.features[feature_ids[j].to_s] << v
+          compound.save
         end
       end
       compounds.duplicates.each do |compound|
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 9733bde..93ce591 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -107,7 +107,6 @@ module OpenTox
         des[lib] << descriptor
       end
       des.each do |lib,descriptors|
-        p lib, descriptors
         send(lib, descriptors)
       end
       serialize
diff --git a/lib/model.rb b/lib/model.rb
index 227d4d3..44b36e6 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -76,22 +76,23 @@ module OpenTox
           t = Time.new
 
           neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
-          # add activities
-          # TODO: improve efficiency, takes 3 times longer than previous version
-          neighbors.collect! do |n|
-            rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
-            acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
-            acts.empty? ? nil : n << acts
-          end
-          neighbors.compact! # remove neighbors without training activities
+          # remove neighbors without prediction_feature
+          # check for database activities (neighbors may include query compound)
+          database_activities = nil
+          if neighbors.collect{|n| n["_id"]}.include? compound.id
-            database_activities = training_dataset.values(compound,prediction_feature)
-            if use_database_values and database_activities and !database_activities.empty?
-              database_activities = database_activities.first if database_activities.size == 1
-              predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
-              next
+            database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s]
+            neighbors.delete_if{|n| n["_id"] == compound.id}
+          end
+          neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
+          if neighbors.empty?
+            prediction = {:value => nil,:confidence => nil,:warning => "Could not find similar compounds."}
+          else
+            prediction = Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset.id,:prediction_feature_id => prediction_feature.id})
           end
-          predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size})
+          prediction[:database_activities] = database_activities
+          predictions << prediction
+
 =begin
 # TODO scaled dataset for physchem
           p neighbor_algorithm_parameters
@@ -126,7 +127,7 @@ module OpenTox
         warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
         prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
         prediction_dataset.compounds = compounds
-        prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
+        prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:database_activities] ? "measured" : p[:confidence] , p[:warning]]}
         prediction_dataset.save_all
         return prediction_dataset
       end
diff --git a/lib/regression.rb b/lib/regression.rb
index 868c25f..575a1ef 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -1,39 +1,26 @@
-# TODO install R packages kernlab, caret, doMC, class, e1071
-
-
-  # log transform activities (create new dataset)
-  # scale, normalize features, might not be necessary
-  # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
-  # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
-  # zero-order correlation and the semi-partial correlation
-  # seems to be necessary for svm
-  # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
-  # http://stackoverflow.com/questions/15436367/svm-scaling-input-values
-  # use lasso or elastic net??
-  # select relevant features
-  # remove features with a single value
-  # remove correlated features
-  # remove features not correlated with endpoint
 module OpenTox
   module Algorithm
 
     class Regression
 
       def self.weighted_average compound, params
+        #p params.keys
         weighted_sum = 0.0
         sim_sum = 0.0
         confidence = 0.0
         neighbors = params[:neighbors]
         activities = []
         neighbors.each do |row|
-          n,sim,acts = row
-          confidence = sim if sim > confidence # distance to nearest neighbor
-          # TODO add LOO errors
-          acts.each do |act|
-            weighted_sum += sim*Math.log10(act)
-            activities << act
-            sim_sum += sim
-          end
+          #if row["dataset_ids"].include? params[:training_dataset_id]
+            sim = row["tanimoto"]
+            confidence = sim if sim > confidence # distance to nearest neighbor
+            # TODO add LOO errors
+            row["features"][params[:prediction_feature_id].to_s].each do |act|
+              weighted_sum += sim*Math.log10(act)
+              activities << act
+              sim_sum += sim
+            end
+          #end
         end
         #R.assign "activities", activities
         #R.eval "cv = cv(activities)"
@@ -47,10 +34,8 @@ module OpenTox
       end
 
       def self.local_linear_regression compound, neighbors
-        p neighbors.size
         return nil unless neighbors.size > 0
         features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq
-        p features
         training_data = Array.new(neighbors.size){Array.new(features.size,0)}
         neighbors.each_with_index do |n,i|
           #p n.first
diff --git a/test/compound.rb b/test/compound.rb
index 22c152b..ff20c1c 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -162,7 +162,7 @@ print c.sdf
   end
 
   def test_fingerprint_db_neighbors
-    skip
+    #skip
     training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
     [
       "CC(=O)CC(C)C#N",
@@ -170,8 +170,18 @@ print c.sdf
       "C(=O)CC(C)C#N",
     ].each do |smi|
       c = OpenTox::Compound.from_smiles smi
+      t = Time.now
       neighbors = c.db_neighbors(:training_dataset_id => training_dataset.id, :min_sim => 0.2)
-      p neighbors
+      p Time.now - t
+      t = Time.now
+      neighbors2 = c.fingerprint_neighbors({:type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.2})
+      p Time.now - t
+      p neighbors.size
+      p neighbors2.size
+      #p neighbors
+      #p neighbors2
+      #p neighbors2 - neighbors
+      #assert_equal neighbors, neighbors2
     end
   end
 end
diff --git a/test/dataset-long.rb b/test/dataset-long.rb
index 5c8dfb8..49b61df 100644
--- a/test/dataset-long.rb
+++ b/test/dataset-long.rb
@@ -86,6 +86,7 @@ class DatasetLongTest < MiniTest::Test
   end
 
   def test_upload_feature_dataset
+    skip
     t = Time.now
     f = File.join DATA_DIR, "rat_feature_dataset.csv"
     d = Dataset.from_csv_file f
diff --git a/test/dataset.rb b/test/dataset.rb
index 4f1e885..1814081 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -127,7 +127,7 @@ class DatasetTest < MiniTest::Test
     original_csv.shift
     csv.each_with_index do |row,i|
       compound = Compound.from_smiles row.shift
-      original_compound = Compound.from_smiles original_csv[i].shift
+      original_compound = Compound.from_smiles original_csv[i].shift.strip
       assert_equal original_compound.inchi, compound.inchi
       row.each_with_index do |v,j|
         if v.numeric?
@@ -142,7 +142,6 @@ class DatasetTest < MiniTest::Test
 
   def test_from_csv
     d = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    p d
     assert_equal Dataset, d.class
     assert_equal 1, d.features.size
     assert_equal 85, d.compounds.size
@@ -170,8 +169,7 @@ class DatasetTest < MiniTest::Test
   def test_from_csv2
     File.open("#{DATA_DIR}/temp_test.csv", "w+") { |file| file.write("SMILES,Hamster\nCC=O,true\n ,true\nO=C(N),true") }
     dataset = Dataset.from_csv_file "#{DATA_DIR}/temp_test.csv"
-    p dataset.warnings
-    assert_equal "Cannot parse SMILES compound ' ' at position 3, all entries are ignored.", dataset.warnings.join
+    assert_equal "Cannot parse SMILES compound '' at position 3, all entries are ignored.", dataset.warnings.join
     File.delete "#{DATA_DIR}/temp_test.csv"
     dataset.features.each{|f| feature = Feature.find f.id; feature.delete}
     dataset.delete
diff --git a/test/fminer-long.rb b/test/fminer-long.rb
index 0f202b4..845ed71 100644
--- a/test/fminer-long.rb
+++ b/test/fminer-long.rb
@@ -3,6 +3,7 @@ require_relative "setup.rb"
 class FminerTest < MiniTest::Test
 
   def test_fminer_multicell
+    skip
     #skip "multicell segfaults"
     # TODO aborts, probably fminer
     # or OpenBabel segfault
@@ -15,6 +16,7 @@ class FminerTest < MiniTest::Test
   end
 
   def test_fminer_isscan
+    skip
     dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
     feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
     assert_equal feature_dataset.compounds.size, dataset.compounds.size
@@ -25,6 +27,7 @@ class FminerTest < MiniTest::Test
   end
 
   def test_fminer_kazius
+    skip
     dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
     # TODO reactivate default settings
     feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
diff --git a/test/lazar-classification.rb b/test/lazar-classification.rb
new file mode 100644
index 0000000..e8b2181
--- /dev/null
+++ b/test/lazar-classification.rb
@@ -0,0 +1,42 @@
+require_relative "setup.rb"
+
+class LazarClassificationTest < MiniTest::Test
+
+  def test_lazar_classification
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    model = Model::LazarClassification.create training_dataset#, feature_dataset
+    #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts
+
+    [ {
+      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
+      :prediction => "false",
+      :confidence => 0.25281385281385277,
+      :nr_neighbors => 11
+    },{
+      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
+      :prediction => "false",
+      :confidence => 0.3639589577089577,
+      :nr_neighbors => 14
+    } ].each do |example|
+      prediction = model.predict example[:compound]
+      assert_equal example[:prediction], prediction[:value]
+      #assert_equal example[:confidence], prediction[:confidence]
+      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
+    end
+
+    compound = Compound.from_smiles "CCO"
+    prediction = model.predict compound
+    assert_equal ["false"], prediction[:database_activities]
+    assert_equal "true", prediction[:value]
+
+    # make a dataset prediction
+    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
+    prediction = model.predict compound_dataset
+    assert_equal compound_dataset.compounds, prediction.compounds
+
+    assert_equal "Could not find similar compounds.", prediction.data_entries[7][2]
+    assert_equal "measured", prediction.data_entries[14][1]
+    # cleanup
+    [training_dataset,model,compound_dataset].each{|o| o.delete}
+  end
+end
diff --git a/test/lazar-fminer.rb b/test/lazar-fminer.rb
index 41e1071..9e024a1 100644
--- a/test/lazar-fminer.rb
+++ b/test/lazar-fminer.rb
@@ -3,6 +3,7 @@ require_relative "setup.rb"
 class LazarFminerTest < MiniTest::Test
 
   def test_lazar_fminer
+    skip
     training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
     model = Model::LazarFminerClassification.create training_dataset#, feature_dataset
     feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
diff --git a/test/lazar-long.rb b/test/lazar-long.rb
index 92d7d5a..525b96e 100644
--- a/test/lazar-long.rb
+++ b/test/lazar-long.rb
@@ -3,6 +3,7 @@ require_relative "setup.rb"
 class LazarExtendedTest < MiniTest::Test
 
   def test_lazar_bbrc_ham_minfreq
+    skip
     dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
     model = Model::LazarFminerClassification.create(dataset, :min_frequency => 5)
     feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
@@ -21,6 +22,7 @@ class LazarExtendedTest < MiniTest::Test
   end
 
   def test_lazar_bbrc_large_ds
+    skip
     dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call_no_dup.csv")
     model = Model::LazarFminerClassification.create dataset
     feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
@@ -44,7 +46,8 @@ class LazarExtendedTest < MiniTest::Test
     feature_dataset.delete
   end
 
-  def test_lazar_kazius
+  def test_lazar_fminer_kazius
+    skip
     t = Time.now
     dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
     p "Dataset upload: #{Time.now-t}"
@@ -68,4 +71,22 @@ class LazarExtendedTest < MiniTest::Test
     #feature_dataset.delete
   end
 
+  def test_lazar_kazius
+    t = Time.now
+    dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
+    p "Dataset upload: #{Time.now-t}"
+    t = Time.now
+    model = Model::LazarClassification.create(dataset)
+    p "Feature mining: #{Time.now-t}"
+    t = Time.now
+    2.times do
+      compound = Compound.from_smiles("Clc1ccccc1NN")
+      prediction = model.predict compound
+      #p prediction
+      assert_equal "1", prediction[:value]
+      #assert_in_delta 0.019858401199860445, prediction[:confidence], 0.001
+    end
+    dataset.delete
+  end
+
 end
diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb
index 4f5a332..c1dc9b9 100644
--- a/test/lazar-regression.rb
+++ b/test/lazar-regression.rb
@@ -8,7 +8,7 @@ class LazarRegressionTest < MiniTest::Test
     compound = Compound.from_smiles "CC(C)(C)CN"
     prediction = model.predict compound
     assert_equal 7.2, prediction[:value].round(1)
-    assert_equal 91, prediction[:neighbors].size
+    assert_equal 88, prediction[:neighbors].size
   end
 
   def test_mpd_fingerprints
@@ -17,7 +17,7 @@ class LazarRegressionTest < MiniTest::Test
     model.neighbor_algorithm_parameters[:type] = "MP2D"
     compound = Compound.from_smiles "CCCSCCSCC"
     prediction = model.predict compound
-    assert_equal 0.02, prediction[:value].round(2)
+    assert_equal 0.04, prediction[:value].round(2)
     assert_equal 3, prediction[:neighbors].size
   end
diff --git a/test/prediction_models.rb b/test/prediction_models.rb
index 1b9e788..067c3c8 100644
--- a/test/prediction_models.rb
+++ b/test/prediction_models.rb
@@ -4,22 +4,13 @@ class PredictionModelTest < MiniTest::Test
 
   def test_prediction_model
     pm = Model::Prediction.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    #dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    #model = Model::LazarFminerClassification.create dataset
-    #cv = ClassificationCrossValidation.create model
-    #metadata = JSON.parse(File.read("#{DATA_DIR}/hamster_carcinogenicity.json"))
-
-    #metadata[:model_id] = model.id
-    #metadata[:crossvalidation_id] = cv.id
-    #pm = Model::Prediction.new(metadata)
-    #pm.save
     [:endpoint,:species,:source].each do |p|
       refute_empty pm[p]
     end
     assert pm.classification?
     refute pm.regression?
     pm.crossvalidations.each do |cv|
-      assert cv.accuracy > 0.75
+      assert cv.accuracy > 0.75, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. Lower accuracies may result from an unfavorable training/test set split."
     end
     prediction = pm.predict Compound.from_smiles("CCCC(NN)C")
     assert_equal "true", prediction[:value]
diff --git a/test/validation.rb b/test/validation.rb
index 6764a32..7de944c 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -3,6 +3,7 @@ require_relative "setup.rb"
 class ValidationTest < MiniTest::Test
 
   def test_fminer_crossvalidation
+    skip
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
     model = Model::LazarFminerClassification.create dataset
     cv = ClassificationCrossValidation.create model
@@ -15,12 +16,13 @@ class ValidationTest < MiniTest::Test
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
     model = Model::LazarClassification.create dataset#, features
     cv = ClassificationCrossValidation.create model
-    assert cv.accuracy > 0.7
-    File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
-    `inkview tmp.svg`
+    #p cv
+    assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
+    #File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
+    #`inkview tmp.svg`
     p cv.nr_unpredicted
     p cv.accuracy
-    #assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy should be larger than unweighted accuracy."
+    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than unweighted accuracy (#{cv.accuracy})."
   end
 
   def test_default_regression_crossvalidation
@@ -28,11 +30,11 @@ class ValidationTest < MiniTest::Test
     model = Model::LazarRegression.create dataset
     cv = RegressionCrossValidation.create model
     #cv = RegressionCrossValidation.find '561503262b72ed54fd000001'
-    p cv.id
-    File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot}
-    `inkview tmp.svg`
-    File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
-    `inkview tmp.svg`
+    #p cv.id
+    #File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot}
+    #`inkview tmp.svg`
+    #File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
+    #`inkview tmp.svg`
     #puts cv.misclassifications.to_yaml
 
     p cv.rmse
@@ -91,9 +93,13 @@ class ValidationTest < MiniTest::Test
     model.save
     cv = ClassificationCrossValidation.create model
     params = model.neighbor_algorithm_parameters
+    params.delete :training_dataset_id
     params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string
+
     cv.validations.each do |validation|
-      assert_equal params, validation.model.neighbor_algorithm_parameters
+      validation_params = validation.model.neighbor_algorithm_parameters
+      validation_params.delete "training_dataset_id"
+      assert_equal params, validation_params
     end
   end
-- 
cgit v1.2.3
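
The heart of this change is the server-side Tanimoto computation in Compound#db_neighbors. Below is a minimal standalone sketch of that aggregation pipeline, assuming the `mongo` gem, a local MongoDB instance, and a `compounds` collection with the `fingerprints`, `default_fingerprint_size`, `features`, and `dataset_ids` fields this commit maintains. The connection URI, database name, the hard-coded "MP2D" fingerprint type, and the demo query fragments are illustrative stand-ins for lazar's `$mongo` client and `DEFAULT_FINGERPRINT` constant, not part of the patch itself.

  require 'mongo'

  # Tanimoto(A,B) = |A & B| / (|A| + |B| - |A & B|), computed inside MongoDB
  # so that only candidates at or above min_sim come back to Ruby.
  def db_neighbors client, query_fingerprint, training_dataset_id, min_sim
    pipeline = [
      {'$project' => {
        'tanimoto' => {'$let' => {
          # size of the intersection of the stored fingerprint array with the query array
          'vars' => {'common' => {'$size' => {'$setIntersection' => ['$fingerprints.MP2D', query_fingerprint]}}},
          'in' => {'$divide' => ['$$common',
            {'$subtract' => [{'$add' => [query_fingerprint.size, '$default_fingerprint_size']}, '$$common']}]}
        }},
        '_id' => 1, 'features' => 1, 'dataset_ids' => 1
      }},
      {'$match' => {'tanimoto' => {'$gte' => min_sim}}},
      {'$sort' => {'tanimoto' => -1}}
    ]
    # dataset membership is filtered client-side, mirroring the patch
    client['compounds'].aggregate(pipeline).select{|r| r['dataset_ids'].include? training_dataset_id}
  end

  client = Mongo::Client.new('mongodb://127.0.0.1:27017/lazar_sketch') # placeholder URI
  query_fp = ['1;C(CC)', '2;N(CC)']                                    # hypothetical MP2D fragments
  p db_neighbors(client, query_fp, BSON::ObjectId.new, 0.2)

Moving the set arithmetic into the $project stage is what "delegates neighbor search to the database backend": the fingerprint table never crosses the wire, only candidates that pass the $match on min_sim do.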