From 3a9c9332b660d35720ad4fa1f55ee0883e53aecd Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 2 Nov 2018 20:34:44 +0100 Subject: warnings fixed, cleanup --- lib/caret.rb | 11 +++++----- lib/classification.rb | 2 +- lib/compound.rb | 30 -------------------------- lib/crossvalidation.rb | 5 ----- lib/dataset.rb | 31 ++++++++------------------- lib/import.rb~ | 6 +++--- lib/leave-one-out-validation.rb | 9 +------- lib/model.rb | 16 +++++--------- lib/regression.rb | 2 +- lib/substance.rb | 1 - lib/train-test-validation.rb | 11 ++-------- lib/validation-statistics.rb | 1 + lib/validation.rb | 2 -- test/classification-validation.rb | 6 +++--- test/dataset.rb | 4 ++-- test/regression-validation.rb | 2 +- test/use_cases.rb | 45 +++++++++++++++++++++++---------------- 17 files changed, 62 insertions(+), 122 deletions(-) diff --git a/lib/caret.rb b/lib/caret.rb index 8bccf74..2e5f1bc 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -22,11 +22,11 @@ module OpenTox end if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances." + prediction[:warnings] = ["No variables for regression model, using weighted average of similar substances (no prediction interval available)."] elsif dependent_variables.size < 3 prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances." 
+ prediction[:warnings] = ["Insufficient number of neighbors (#{dependent_variables.size}) for regression model, using weighted average of similar substances (no prediction interval available)."] else dependent_variables.each_with_index do |v,i| dependent_variables[i] = to_r(v) @@ -51,7 +51,8 @@ module OpenTox $logger.debug dependent_variables $logger.debug independent_variables prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances." + prediction[:warnings] ||= [] + prediction[:warnings] << "R caret model creation error, using weighted average of similar substances (no prediction interval available)." return prediction end begin @@ -72,12 +73,12 @@ module OpenTox $logger.debug "R caret prediction error for:" $logger.debug self.inspect prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances" + prediction[:warnings] << "R caret prediction error, using weighted average of similar substances (no prediction interval available)." return prediction end if prediction.nil? or prediction[:value].nil? prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances." + prediction[:warnings] << "Empty R caret prediction, using weighted average of similar substances (no prediction interval available)." 
end end prediction diff --git a/lib/classification.rb b/lib/classification.rb index e78783b..638492b 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -21,7 +21,7 @@ module OpenTox probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) - {:value => prediction,:probabilities => probabilities,:warnings => []} + {:value => prediction,:probabilities => probabilities} end end diff --git a/lib/compound.rb b/lib/compound.rb index 8b4bb48..6d0e075 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -230,36 +230,6 @@ module OpenTox update(:cid => RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] self["cid"] end - - def db_neighbors min_sim: 0.2, dataset_id: - #p fingerprints[DEFAULT_FINGERPRINT] - # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb - - #qn = default_fingerprint_size - #qmin = qn * threshold - #qmax = qn / threshold - #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) 
- #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] - aggregate = [ - #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, - #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self - {'$project' => { - 'similarity' => {'$let' => { - 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, - 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} - }}, - '_id' => 1, - #'measurements' => 1, - 'dataset_ids' => 1 - }}, - {'$match' => {'similarity' => {'$gte' => min_sim}}}, - {'$sort' => {'similarity' => -1}} - ] - - # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array - $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id} - - end # Convert mmol to mg # @return [Float] value in mg diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 4f61ff4..c866ebc 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -24,8 +24,6 @@ module OpenTox ) cv.save # set created_at - nr_instances = 0 - nr_unpredicted = 0 training_dataset = model.training_dataset training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems @@ -33,12 +31,9 @@ module OpenTox t = Time.now validation = TrainTest.create(model, fold[0], fold[1]) cv.validation_ids << validation.id - cv.nr_instances += validation.nr_instances - cv.nr_unpredicted += validation.nr_unpredicted $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" end cv.save - $logger.debug "Nr unpredicted: #{cv.nr_unpredicted}" cv.statistics cv.update_attributes(finished_at: Time.now) cv diff --git a/lib/dataset.rb b/lib/dataset.rb 
index d02a302..42733e4 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -310,10 +310,6 @@ module OpenTox end all_substances << substance - substance.dataset_ids << self.id - substance.dataset_ids.uniq! - substance.save - add substance, original_id, original_id_value vals.each_with_index do |v,j| @@ -422,6 +418,7 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n + $logger.debug "Creating #{n} folds for #{name}." len = self.substances.size indices = (0..len-1).to_a.shuffle mid = (len/n) @@ -431,19 +428,15 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_substances = test_idxs.collect{|i| substances[i]} + test_substances = test_idxs.collect{|i| substances[i].id} training_idxs = indices-test_idxs - training_substances = training_idxs.collect{|i| substances[i]} + training_substances = training_idxs.collect{|i| substances[i].id} chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) - substances.each do |substance| - substance.dataset_ids << dataset.id - substance.dataset_ids.uniq! - substance.save - dataset.data_entries += data_entries.select{|row| row[0] == substance.id} - end - dataset.save - dataset + self.class.create( + :name => "#{self.name} (Fold #{i-1})", + :source => self.id, + :data_entries => data_entries.select{|row| substances.include? row[0]} + ) end start = last+1 chunks << chunk @@ -468,7 +461,7 @@ module OpenTox if features.first.kind_of? 
NominalFeature merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(", ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps) else - merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO, :transformations + merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations end else bad_request_error "Cannot merge features of different types (#{feature_classes})." @@ -521,12 +514,6 @@ module OpenTox def transform # TODO end - # Delete dataset - def delete - compounds.each{|c| c.dataset_ids.delete id.to_s} - super - end - end end diff --git a/lib/import.rb~ b/lib/import.rb~ index 0857717..cf1a26f 100644 --- a/lib/import.rb~ +++ b/lib/import.rb~ @@ -47,9 +47,9 @@ module OpenTox :core_id => core_id, :coating_ids => coating_ids ) - np["bundles"].keys.each do |bundle_uri| - nanoparticle.dataset_ids << datasets[bundle_uri].id - end + #np["bundles"].keys.each do |bundle_uri| + #nanoparticle.dataset_ids << datasets[bundle_uri].id + #end studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"] studies.each do |study| diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index b0905b8..d37b6ce 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -16,18 +16,11 @@ module OpenTox loo = klass.new :model_id => model.id predictions = model.predict model.training_dataset.substances predictions.each{|cid,p| p.delete(:neighbors)} - nr_unpredicted = 0 predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) - else - 
nr_unpredicted += 1 - end + prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) if prediction[:value] predictions.delete(cid) unless prediction[:value] and prediction[:measurements] end predictions.select!{|cid,p| p[:value] and p[:measurements]} - loo.nr_instances = predictions.size - loo.nr_unpredicted = nr_unpredicted loo.predictions = predictions loo.statistics $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" diff --git a/lib/model.rb b/lib/model.rb index f50fcd7..a0c60f0 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -38,7 +38,6 @@ module OpenTox def self.create prediction_feature:nil, training_dataset:, algorithms:{} bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature - # TODO: prediction_feature without training_dataset: use all available data # guess model type prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new @@ -198,9 +197,8 @@ module OpenTox # Predict a substance (compound or nanoparticle) # @param [OpenTox::Substance] # @return [Hash] - def predict_substance substance, threshold = self.algorithms[:similarity][:min] + def predict_substance substance, threshold = self.algorithms[:similarity][:min], prediction = nil - t = Time.now @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] when /tanimoto/ # binary features @@ -229,7 +227,7 @@ module OpenTox bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." 
end - prediction = {:warnings => [], :measurements => []} + prediction ||= {:warnings => [], :measurements => []} prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min] neighbor_ids = [] neighbor_similarities = [] @@ -240,7 +238,7 @@ module OpenTox substance_ids.each_with_index do |s,i| # handle query substance if substance.id.to_s == s - prediction[:measurements] << dependent_variables[i] + prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min] # add measurements only once at first pass prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else if fingerprints? @@ -277,17 +275,13 @@ module OpenTox result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} - #if neighbor_similarities.max < algorithms[:similarity][:warn_min] - #prediction[:warnings] << "Closest neighbor has similarity #{neighbor_similarities.max} < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." - #end end if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2 prediction else # try again with a lower threshold - predict_substance substance, 0.2 + prediction[:warnings] << "Lowering similarity threshold to 0.2." 
+ predict_substance substance, 0.2, prediction end - #p Time.now - t - prediction end # Predict a substance (compound or nanoparticle), an array of substances or a dataset diff --git a/lib/regression.rb b/lib/regression.rb index 25c0732..fd2855f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -17,7 +17,7 @@ module OpenTox sim_sum += weights[i] end if dependent_variables sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum - {:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]} + {:value => prediction} end end diff --git a/lib/substance.rb b/lib/substance.rb index ef49659..5c486d8 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -3,7 +3,6 @@ module OpenTox # Base class for substances (e.g. compunds, nanoparticles) class Substance field :properties, type: Hash, default: {} - field :dataset_ids, type: Array, default: [] end end diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb index bffee8c..d034cd1 100644 --- a/lib/train-test-validation.rb +++ b/lib/train-test-validation.rb @@ -18,22 +18,15 @@ module OpenTox validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms validation_model.save predictions = validation_model.predict test_set.substances - nr_unpredicted = 0 predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) - else - nr_unpredicted += 1 - end + prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) if prediction[:value] end predictions.select!{|cid,p| p[:value] and p[:measurements]} - # hack to avoid mongos file size limit error on large datasets + # remove neighbors to avoid mongos file size limit error on large datasets predictions.each{|cid,p| p.delete(:neighbors)} #if model.training_dataset.name.match(/mutagenicity/i) validation = self.new( 
:model_id => validation_model.id, :test_dataset_id => test_set.id, - :nr_instances => test_set.substances.size, - :nr_unpredicted => nr_unpredicted, :predictions => predictions ) validation.save diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 7bae891..ad4c14d 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -82,6 +82,7 @@ module OpenTox self.weighted_accuracy[a] = (weighted_confusion_matrix[a][0][0]+weighted_confusion_matrix[a][1][1])/confidence_sum[a].to_f end $logger.debug "Accuracy #{accuracy}" + $logger.debug "Nr Predictions #{nr_predictions}" save { :accept_values => accept_values, diff --git a/lib/validation.rb b/lib/validation.rb index c9954b6..9402361 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -10,8 +10,6 @@ module OpenTox store_in collection: "validations" field :name, type: String field :model_id, type: BSON::ObjectId - field :nr_instances, type: Integer, default: 0 - field :nr_unpredicted, type: Integer, default: 0 field :predictions, type: Hash, default: {} field :finished_at, type: Time diff --git a/test/classification-validation.rb b/test/classification-validation.rb index 85db8ba..302b2c8 100644 --- a/test/classification-validation.rb +++ b/test/classification-validation.rb @@ -1,12 +1,13 @@ require_relative "setup.rb" -class ValidationClassificationTest < MiniTest::Test +class ClassificationValidationTest < MiniTest::Test include OpenTox::Validation # defaults def test_default_classification_crossvalidation - dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + #dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + dataset = Dataset.from_csv_file "#{DATA_DIR}/multi_cell_call.csv" model = Model::Lazar.create training_dataset: dataset cv = ClassificationCrossValidation.create model assert cv.accuracy[:without_warnings] > 0.65, "Accuracy (#{cv.accuracy[:without_warnings]}) should be larger than 0.65, this may occur due to an 
unfavorable training/test set split" @@ -45,7 +46,6 @@ class ValidationClassificationTest < MiniTest::Test dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" model = Model::Lazar.create training_dataset: dataset loo = ClassificationLeaveOneOut.create model - assert_equal 77, loo.nr_unpredicted refute_empty loo.confusion_matrix assert loo.accuracy[:without_warnings] > 0.650 assert loo.weighted_accuracy[:all] > loo.accuracy[:all], "Weighted accuracy (#{loo.weighted_accuracy[:all]}) should be larger than accuracy (#{loo.accuracy[:all]})." diff --git a/test/dataset.rb b/test/dataset.rb index 70d26d2..40aa334 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -23,9 +23,9 @@ class DatasetTest < MiniTest::Test def test_import_pubchem d = Dataset.from_pubchem_aid 1191 - assert_equal 87, d.compounds.size + assert_equal 86, d.compounds.size assert_equal 3, d.features.size - assert_equal ["Active"], d.values(d.compounds[10],d.features[2]) + assert_equal ["Inactive"], d.values(d.compounds[10],d.features[2]) # TODO endpoint name # TODO regression import end diff --git a/test/regression-validation.rb b/test/regression-validation.rb index 44162c0..9418df4 100644 --- a/test/regression-validation.rb +++ b/test/regression-validation.rb @@ -1,6 +1,6 @@ require_relative "setup.rb" -class ValidationRegressionTest < MiniTest::Test +class RegressionValidationTest < MiniTest::Test include OpenTox::Validation # defaults diff --git a/test/use_cases.rb b/test/use_cases.rb index 4e072d8..4842a18 100644 --- a/test/use_cases.rb +++ b/test/use_cases.rb @@ -3,41 +3,50 @@ require_relative "setup.rb" class UseCasesTest < MiniTest::Test def test_PA - #kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" - #hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" - #efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" - #datasets = [kazius,hansen,efsa] - #map = {"1" => "mutagen", "0" => "nonmutagen"} + # TODO add assertions + skip "This test is very time consuming, 
enable on demand." + kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" + hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" + efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" + datasets = [kazius,hansen,efsa] + map = {"1" => "mutagen", "0" => "nonmutagen"} #p "merging" - #training_dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,map,map], keep_original_features: false, remove_duplicates: true - #assert_equal 8281, training_dataset.compounds.size + training_dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,map,map], keep_original_features: false, remove_duplicates: true + assert_equal 8281, training_dataset.compounds.size #p training_dataset.features.size #p training_dataset.id #training_dataset = Dataset.find('5bd8ac8fca62695d767fca6b') #training_dataset = Dataset.find('5bd8bbadca62695f69e7a33b') #puts training_dataset.to_csv - p "create model_validation" - #model_validation = Model::Validation.from_dataset training_dataset: training_dataset, prediction_feature: training_dataset.merged_features.first, species: "Salmonella typhimurium", endpoint: "Mutagenicity" + #p "create model_validation" + model_validation = Model::Validation.from_dataset training_dataset: training_dataset, prediction_feature: training_dataset.merged_features.first, species: "Salmonella typhimurium", endpoint: "Mutagenicity" #p model_validation.id #model_validation = Model::Validation.find '5bd8df47ca6269604590ab38' + #p model_validation.crossvalidations.first.predictions.select{|cid,p| !p["warnings"].empty?} #p "predict" - #pa = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" - #prediction_dataset = model_validation.predict pa + pa = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" + prediction_dataset = model_validation.predict pa #p prediction_dataset.id - prediction_dataset = Dataset.find('5bd98b88ca6269609aab79f4') - puts 
prediction_dataset.to_csv + #prediction_dataset = Dataset.find('5bd98b88ca6269609aab79f4') + #puts prediction_dataset.to_csv end def test_tox21 + # TODO add assertions + skip "This test is very time consuming, enable on demand." training_dataset = Dataset.from_pubchem_aid 743122 - p training_dataset.id + #p training_dataset.id #'5bd9a1dbca626969d97fb421' - File.open("AID743122.csv","w+"){|f| f.puts training_dataset.to_csv} - model = Model::Lazar.create training_dataset: training_dataset - p model.id + #File.open("AID743122.csv","w+"){|f| f.puts training_dataset.to_csv} + #model = Model::Lazar.create training_dataset: training_dataset + #p model.id #p Model::Lazar.find('5bd9a70bca626969d97fc9df') model_validation = Model::Validation.from_dataset training_dataset: training_dataset, prediction_feature: training_dataset.bioactivity_features.first, species: "Human HG2L7.5c1 cell line", endpoint: "aryl hydrocarbon receptor (AhR) signaling pathway activation" - p model_validation.id + #model_validation = Model::Validation.find '5bd9b210ca62696be39ab74d' + #model_validation.crossvalidations.each do |cv| + #p cv + #end + #p model_validation.crossvalidations.first.predictions.select{|cid,p| !p["warnings"].empty?} end def test_public_models -- cgit v1.2.3