From 84222bae2bbb9fb3e0ce3e65de1be8e7f94d2147 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 12 Apr 2016 12:37:37 +0200 Subject: new dataset structure --- lib/model.rb | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 8e657b8..1f9942b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -227,6 +227,45 @@ module OpenTox end end + class NanoLazar + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "models" + + field :name, type: String + field :creator, type: String, default: __FILE__ + # datasets + field :training_dataset_id, type: BSON::ObjectId + # algorithms + field :prediction_algorithm, type: String + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId + field :training_particle_ids, type: Array + + def self.create_all + nanoparticles = Nanoparticle.all + toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id} + tox = {} + toxfeatures.each do |t| + tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s} + end + tox.select!{|t,nps| nps.size > 50} + tox.collect do |t,nps| + find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id}) + end + end + + def predict nanoparticle + training = training_particle_ids.collect{|id| Nanoparticle.find id} + training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq + query_features = nanoparticle.physchem_descriptors.keys + common_features = (training_features & query_features) + p common_features + end + + end + end end -- cgit v1.2.3 From a8368dda776c05331474adf7eaf9a6e413a3b1eb Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 13 Apr 2016 15:15:51 +0200 Subject: validation tests pass --- lib/model.rb | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 1f9942b..5140d5a 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -90,33 +90,36 @@ module OpenTox end # make predictions - predictions = [] - predictions = compounds.collect{|c| predict_compound c} + predictions = {} + compounds.each do |c| + predictions[c.id.to_s] = predict_compound c + predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id + end # serialize result case object.class.to_s when "OpenTox::Compound" - prediction = predictions.first + prediction = predictions[compounds.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity - return prediction + return predictions when "Array" return predictions when "OpenTox::Dataset" + predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id - prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) + prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) prediction_dataset = LazarPrediction.new( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, :prediction_feature_id => prediction_feature.id ) - confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" ) - warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") - prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ] - prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]} + + compounds.each_with_index do |c,i| + prediction_dataset.predictions[c.id.to_s] = predictions[i] + end prediction_dataset.save return prediction_dataset end -- cgit v1.2.3 From 753fcc204d93d86c76860bee6e2f7d0468c3c940 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 14 Apr 2016 19:43:24 +0200 Subject: features/toxicities fixed --- lib/model.rb | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 5140d5a..1960c10 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -36,6 +36,7 @@ module OpenTox super params # TODO document convention + #p training_dataset.features prediction_feature = training_dataset.features.first # set defaults for empty parameters self.prediction_feature_id ||= prediction_feature.id @@ -56,12 +57,13 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq + #TODO restrict to dataset features + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end - neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] } + neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) else @@ -78,12 +80,11 @@ module OpenTox # parse data compounds = [] - case object.class.to_s - when "OpenTox::Compound" + if object.is_a? Substance compounds = [object] - when "Array" + elsif object.is_a? Array compounds = object - when "OpenTox::Dataset" + elsif object.is_a? Dataset compounds = object.compounds else bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." @@ -97,30 +98,26 @@ module OpenTox end # serialize result - case object.class.to_s - when "OpenTox::Compound" + if object.is_a? Substance prediction = predictions[compounds.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity + return prediction + elsif object.is_a? Array return predictions - when "Array" - return predictions - when "OpenTox::Dataset" + elsif object.is_a? Dataset predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) - prediction_dataset = LazarPrediction.new( + prediction_dataset = LazarPrediction.create( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, - :prediction_feature_id => prediction_feature.id - + :prediction_feature_id => prediction_feature.id, + :predictions => predictions ) - compounds.each_with_index do |c,i| - prediction_dataset.predictions[c.id.to_s] = predictions[i] - end - prediction_dataset.save + #prediction_dataset.save return prediction_dataset end @@ -264,7 +261,7 @@ module OpenTox training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq query_features = nanoparticle.physchem_descriptors.keys common_features = (training_features & query_features) - p common_features + #p common_features end end -- cgit v1.2.3 From 8aab046eb1ad39aaf10c5a8596102c35c7b2ee0b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 15 Apr 2016 11:01:16 +0200 Subject: data_entries removed from datasets. datasets are now just containers for compounds and features, feature values have to be retrieved from substances. --- lib/model.rb | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 1960c10..b82f098 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -20,6 +20,10 @@ module OpenTox def training_dataset Dataset.find(training_dataset_id) end + + def prediction_feature + Feature.find(prediction_feature_id) + end end class Lazar < Model @@ -31,13 +35,10 @@ module OpenTox # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset # @return [OpenTox::Model::Lazar] Regression or classification model - def initialize training_dataset, params={} + def initialize prediction_feature, training_dataset, params={} super params - # TODO document convention - #p training_dataset.features - prediction_feature = training_dataset.features.first # set defaults for empty parameters self.prediction_feature_id ||= prediction_feature.id self.training_dataset_id ||= training_dataset.id @@ -49,7 +50,6 @@ module OpenTox end def predict_compound compound - prediction_feature = Feature.find prediction_feature_id neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature # check for database activities (neighbors may include query compound) @@ -122,18 +122,13 @@ module OpenTox end end - - def training_activities - i = training_dataset.feature_ids.index prediction_feature_id - training_dataset.data_entries.collect{|de| de[i]} - end end class LazarClassification < Lazar - def self.create training_dataset, params={} - model = self.new training_dataset, params + def self.create prediction_feature, training_dataset, params={} + model = self.new prediction_feature, training_dataset, params model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm model.neighbor_algorithm ||= "fingerprint_neighbors" model.neighbor_algorithm_parameters ||= {} @@ -151,8 +146,8 @@ module OpenTox class LazarRegression < Lazar - def self.create training_dataset, params={} - model = self.new training_dataset, params + def self.create prediction_feature, training_dataset, params={} + model = self.new prediction_feature, training_dataset, params model.neighbor_algorithm ||= "fingerprint_neighbors" model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" model.neighbor_algorithm_parameters ||= {} @@ -173,13 +168,13 @@ module OpenTox include Mongoid::Document include Mongoid::Timestamps - # TODO field Validations field :endpoint, type: String field :species, type: String field :source, type: String field :unit, type: String field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId + field :leave_one_out_validation_id, type: BSON::ObjectId def predict object Lazar.find(model_id).predict object @@ -201,12 +196,16 @@ module OpenTox repeated_crossvalidation.crossvalidations end + def leave_one_out_validation + LeaveOneOutValidation.find leave_one_out_validation_id + end + def regression? - training_dataset.features.first.numeric? + model.is_a? LazarRegression end def classification? - training_dataset.features.first.nominal? + model.is_a? LazarClassification end def self.from_csv_file file @@ -214,14 +213,17 @@ module OpenTox bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file prediction_model = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file + prediction_feature = training_dataset.features.first model = nil - if training_dataset.features.first.nominal? - model = LazarClassification.create training_dataset - elsif training_dataset.features.first.numeric? - model = LazarRegression.create training_dataset + if prediction_feature.nominal? + model = LazarClassification.create prediction_feature, training_dataset + elsif prediction_feature.numeric? + model = LazarRegression.create prediction_feature, training_dataset end prediction_model[:model_id] = model.id + prediction_model[:prediction_feature_id] = prediction_feature.id prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id + prediction_model[:leave_one_out_validation_id] = LeaveOneOutValidation.create(model).id prediction_model.save prediction_model end -- cgit v1.2.3 From cfc64a2966ab38698e499f0b44f41208ee77a07f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 26 Apr 2016 17:38:15 +0200 Subject: first nanomaterial prediction --- lib/model.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index b82f098..45054e2 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -50,6 +50,7 @@ module OpenTox end def predict_compound compound + #p compound neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature # check for database activities (neighbors may include query compound) -- cgit v1.2.3 From acf19c81e345ceccde834653a0f0edce27827958 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Apr 2016 11:05:05 +0200 Subject: compound classification fixed --- lib/model.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 45054e2..80b4685 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -64,7 +64,7 @@ module OpenTox prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end - neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } + #neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) else -- cgit v1.2.3 From 79238bddb59607aa9f759caa9e3c8db176709703 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Apr 2016 12:19:48 +0200 Subject: compound validations fixed --- lib/model.rb | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 80b4685..f61368e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -50,7 +50,6 @@ module OpenTox end def predict_compound compound - #p compound neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature # check for database activities (neighbors may include query compound) -- cgit v1.2.3 From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 May 2016 19:24:42 +0200 Subject: first reasonable results for nanoparticle crossvalidation --- lib/model.rb | 101 ++++++++++++++++++++++++++--------------------------------- 1 file changed, 44 insertions(+), 57 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index f61368e..841ab20 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -2,7 +2,7 @@ module OpenTox module Model - class Model + class Lazar include OpenTox include Mongoid::Document include Mongoid::Timestamps @@ -10,27 +10,13 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ - # datasets field :training_dataset_id, type: BSON::ObjectId - # algorithms field :prediction_algorithm, type: String - # prediction feature field :prediction_feature_id, type: BSON::ObjectId - - def training_dataset - Dataset.find(training_dataset_id) - end - - def prediction_feature - Feature.find(prediction_feature_id) - end - end - - class Lazar < Model - - # algorithms field :neighbor_algorithm, type: String field :neighbor_algorithm_parameters, type: Hash, default: {} + field :feature_selection_algorithm, type: String + field :relevant_features, type: Hash # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset @@ -45,10 +31,43 @@ module OpenTox self.name ||= "#{training_dataset.name} #{prediction_feature.name}" self.neighbor_algorithm_parameters ||= {} self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + + Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm save self end + def correlation_filter + toxicities = [] + substances = [] + training_dataset.substances.each do |s| + s["toxicities"][prediction_feature_id].each do |act| + toxicities << act + substances << s + end + end + R.assign "tox", toxicities + feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids.each do |feature_id| + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} + R.assign "feature", feature_values + begin + #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature] = {} + relevant_features[feature]["pvalue"] = pvalue + relevant_features[feature]["r"] = r + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." + end + end + relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + end + def predict_compound compound neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature @@ -63,7 +82,6 @@ module OpenTox prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end - #neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) else @@ -123,6 +141,14 @@ module OpenTox end + def training_dataset + Dataset.find(training_dataset_id) + end + + def prediction_feature + Feature.find(prediction_feature_id) + end + end class LazarClassification < Lazar @@ -229,45 +255,6 @@ module OpenTox end end - class NanoLazar - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "models" - - field :name, type: String - field :creator, type: String, default: __FILE__ - # datasets - field :training_dataset_id, type: BSON::ObjectId - # algorithms - field :prediction_algorithm, type: String - # prediction feature - field :prediction_feature_id, type: BSON::ObjectId - field :training_particle_ids, type: Array - - def self.create_all - nanoparticles = Nanoparticle.all - toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id} - tox = {} - toxfeatures.each do |t| - tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s} - end - tox.select!{|t,nps| nps.size > 50} - tox.collect do |t,nps| - find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id}) - end - end - - def predict nanoparticle - training = training_particle_ids.collect{|id| Nanoparticle.find id} - training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq - query_features = nanoparticle.physchem_descriptors.keys - common_features = (training_features & query_features) - #p common_features - end - - end - end end -- cgit v1.2.3 From ab7b37541b4f8a762be737009631d3eefd898b4a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 5 May 2016 16:14:02 +0200 Subject: ambit mirror, import from mirrored json, proteomics import --- lib/model.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 841ab20..12abc6e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -47,9 +47,9 @@ module OpenTox end end R.assign "tox", toxicities - feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} + feature_values = substances.collect{|s| s["physchem"][feature_id]} R.assign "feature", feature_values begin #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" -- cgit v1.2.3 From 51f57e2858b60bed74ebcc97189b2188c900c283 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 6 May 2016 12:49:28 +0200 Subject: dataset tests cleanup --- lib/model.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 12abc6e..841ab20 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -47,9 +47,9 @@ module OpenTox end end R.assign "tox", toxicities - feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq + feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem"][feature_id]} + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} R.assign "feature", feature_values begin #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" -- cgit v1.2.3 From 06fc914653face2c58fd4e6c47161cb03e217582 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 8 May 2016 12:22:58 +0200 Subject: default validations fixed --- lib/model.rb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 841ab20..5b094fb 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -41,7 +41,7 @@ module OpenTox toxicities = [] substances = [] training_dataset.substances.each do |s| - s["toxicities"][prediction_feature_id].each do |act| + s["toxicities"][prediction_feature_id][training_dataset_id.to_s].each do |act| toxicities << act substances << s end @@ -76,8 +76,7 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id - #TODO restrict to dataset features - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} -- cgit v1.2.3 From 611bac891177f8d9185d45486dd574b6ef4d1912 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 15:11:46 +0200 Subject: nanoparticle models fixed --- lib/model.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 5b094fb..070248a 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -76,6 +76,7 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id + me = neighbors.select{|n| n["_id"] == compound.id}.first database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." -- cgit v1.2.3 From b8bb12c8a163c238d7d4387c1914e2100bb660df Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 12 May 2016 15:23:01 +0200 Subject: enm study import fixed --- lib/model.rb | 65 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 070248a..8baed41 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -30,7 +30,7 @@ module OpenTox self.training_dataset_id ||= training_dataset.id self.name ||= "#{training_dataset.name} #{prediction_feature.name}" self.neighbor_algorithm_parameters ||= {} - self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm save @@ -41,7 +41,7 @@ module OpenTox toxicities = [] substances = [] training_dataset.substances.each do |s| - s["toxicities"][prediction_feature_id][training_dataset_id.to_s].each do |act| + training_dataset.values(s,prediction_feature_id).each do |act| toxicities << act substances << s end @@ -68,24 +68,41 @@ module OpenTox relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h end - def predict_compound compound - neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) - # remove neighbors without prediction_feature - # check for database activities (neighbors may include query compound) + def predict_substance substance + neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) database_activities = nil prediction = {} - if neighbors.collect{|n| n["_id"]}.include? compound.id + # handle query substance + if neighbors.collect{|n| n["_id"]}.include? substance.id - me = neighbors.select{|n| n["_id"] == compound.id}.first - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq + query = neighbors.select{|n| n["_id"] == substance.id}.first + database_activities = training_dataset.values(query["_id"],prediction_feature_id) prediction[:database_activities] = database_activities - prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." - neighbors.delete_if{|n| n["_id"] == compound.id} + prediction[:warning] = "#{database_activities.size} substances have been removed from neighbors, because they are identical with the query substance." + neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) end if neighbors.empty? - prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) + prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + elsif neighbors.size == 1 + value = nil + tox = neighbors.first["toxicities"] + if tox.size == 1 # single measurement + value = tox + else # multiple measurement + if tox.collect{|t| t.numeric?}.uniq == [true] # numeric + value = tox.median + elsif tox.uniq.size == 1 # single value + value = tox.first + else # contradictory results + # TODO add majority vote + end + end + prediction.merge!({:value => value, :confidence => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values."}) if value else - prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id})) + # call prediction algorithm + klass,method = prediction_algorithm.split('.') + result = Object.const_get(klass).send(method,substance,neighbors) + prediction.merge! result prediction[:neighbors] = neighbors prediction[:neighbors] ||= [] end @@ -97,27 +114,27 @@ module OpenTox training_dataset = Dataset.find training_dataset_id # parse data - compounds = [] + substances = [] if object.is_a? Substance - compounds = [object] + substances = [object] elsif object.is_a? Array - compounds = object + substances = object elsif object.is_a? Dataset - compounds = object.compounds + substances = object.substances else bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." end # make predictions predictions = {} - compounds.each do |c| - predictions[c.id.to_s] = predict_compound c + substances.each do |c| + predictions[c.id.to_s] = predict_substance c predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id end # serialize result if object.is_a? Substance - prediction = predictions[compounds.first.id.to_s] + prediction = predictions[substances.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity return prediction elsif object.is_a? Array @@ -160,7 +177,8 @@ module OpenTox model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", - :training_dataset_id => training_dataset.id, + :dataset_id => training_dataset.id, + :prediction_feature_id => prediction_feature.id, :min_sim => 0.1 }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value @@ -179,8 +197,9 @@ module OpenTox model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", - :training_dataset_id => training_dataset.id, - :min_sim => 0.1 + :min_sim => 0.1, + :dataset_id => training_dataset.id, + :prediction_feature_id => prediction_feature.id, }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end -- cgit v1.2.3 From cc08e6beda7f7d70ebf6c6929a22d1a0cd7c1a20 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 24 May 2016 15:41:24 +0200 Subject: tests fixed. DescriptorTest#test_compound_all may fail within all.rb --- lib/model.rb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 8baed41..3a178a1 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -69,6 +69,7 @@ module OpenTox end def predict_substance substance + neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) database_activities = nil prediction = {} @@ -82,22 +83,22 @@ module OpenTox neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) end if neighbors.empty? - prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) elsif neighbors.size == 1 value = nil tox = neighbors.first["toxicities"] if tox.size == 1 # single measurement - value = tox + value = tox.first else # multiple measurement if tox.collect{|t| t.numeric?}.uniq == [true] # numeric value = tox.median elsif tox.uniq.size == 1 # single value value = tox.first else # contradictory results - # TODO add majority vote + # TODO add majority vote?? end end - prediction.merge!({:value => value, :confidence => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values."}) if value + prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value else # call prediction algorithm klass,method = prediction_algorithm.split('.') -- cgit v1.2.3 From f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 27 May 2016 19:16:16 +0200 Subject: first correlation of nanoparticle predictions --- lib/model.rb | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 3a178a1..18d621b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -32,12 +32,13 @@ module OpenTox self.neighbor_algorithm_parameters ||= {} self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id - Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm + #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm save self end def correlation_filter + self.relevant_features = {} toxicities = [] substances = [] training_dataset.substances.each do |s| @@ -49,23 +50,22 @@ module OpenTox R.assign "tox", toxicities feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} R.assign "feature", feature_values begin - #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')" + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" pvalue = R.eval("cor$p.value").to_ruby if pvalue <= 0.05 r = R.eval("cor$estimate").to_ruby - relevant_features[feature] = {} - relevant_features[feature]["pvalue"] = pvalue - relevant_features[feature]["r"] = r + self.relevant_features[feature_id] = {} + self.relevant_features[feature_id]["pvalue"] = pvalue + self.relevant_features[feature_id]["r"] = r end rescue warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." end end - relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h end def predict_substance substance -- cgit v1.2.3 From b515a0cfedb887a2af753db6e4a08ae1af430cad Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 31 May 2016 18:08:08 +0200 Subject: cleanup of validation modules/classes --- lib/model.rb | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 18d621b..988cac9 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -22,7 +22,6 @@ module OpenTox # @param [OpenTox::Dataset] training_dataset # @return [OpenTox::Model::Lazar] Regression or classification model def initialize prediction_feature, training_dataset, params={} - super params # set defaults for empty parameters @@ -39,15 +38,15 @@ module OpenTox def correlation_filter self.relevant_features = {} - toxicities = [] + measurements = [] substances = [] training_dataset.substances.each do |s| training_dataset.values(s,prediction_feature_id).each do |act| - toxicities << act + measurements << act substances << s end end - R.assign "tox", toxicities + R.assign "tox", measurements feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} @@ -62,7 +61,7 @@ module OpenTox self.relevant_features[feature_id]["r"] = r end rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." end end self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h @@ -71,22 +70,22 @@ module OpenTox def predict_substance substance neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) - database_activities = nil + measurements = nil prediction = {} # handle query substance if neighbors.collect{|n| n["_id"]}.include? substance.id query = neighbors.select{|n| n["_id"] == substance.id}.first - database_activities = training_dataset.values(query["_id"],prediction_feature_id) - prediction[:database_activities] = database_activities - prediction[:warning] = "#{database_activities.size} substances have been removed from neighbors, because they are identical with the query substance." + measurements = training_dataset.values(query["_id"],prediction_feature_id) + prediction[:measurements] = measurements + prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance." neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) end if neighbors.empty? prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) elsif neighbors.size == 1 value = nil - tox = neighbors.first["toxicities"] + tox = neighbors.first["measurements"] if tox.size == 1 # single measurement value = tox.first else # multiple measurement @@ -141,7 +140,7 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - predictions.each{|cid,p| p.delete(:neighbors)} + #predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id @@ -187,6 +186,7 @@ module OpenTox model.save model end + end class LazarRegression < Lazar @@ -197,19 +197,21 @@ module OpenTox model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" model.neighbor_algorithm_parameters ||= {} { - :type => "MP2D", :min_sim => 0.1, :dataset_id => training_dataset.id, :prediction_feature_id => prediction_feature.id, }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end + model.neighbor_algorithm_parameters[:type] = "MP2D" if training_dataset.substances.first.is_a? Compound model.save model end + end class Prediction + include OpenTox include Mongoid::Document include Mongoid::Timestamps -- cgit v1.2.3 From 65b69d4c35890a7a2d2992108f0cf4eb5202dd1b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 1 Jun 2016 10:37:00 +0200 Subject: validation tests fixed --- lib/model.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 988cac9..81f9629 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -33,7 +33,6 @@ module OpenTox #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm save - self end def correlation_filter @@ -203,7 +202,7 @@ module OpenTox }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end - model.neighbor_algorithm_parameters[:type] = "MP2D" if training_dataset.substances.first.is_a? Compound + model.neighbor_algorithm_parameters[:type] ||= "MP2D" if training_dataset.substances.first.is_a? Compound model.save model end -- cgit v1.2.3 From 458a2d753551ea607f2ed5efdd0ac0a02d55d673 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 1 Jun 2016 12:46:03 +0200 Subject: all tests fixed --- lib/model.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 81f9629..3482aee 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -236,7 +236,7 @@ module OpenTox end def repeated_crossvalidation - RepeatedCrossValidation.find repeated_crossvalidation_id + Validation::RepeatedCrossValidation.find repeated_crossvalidation_id end def crossvalidations @@ -244,7 +244,7 @@ module OpenTox end def leave_one_out_validation - LeaveOneOutValidation.find leave_one_out_validation_id + Validation::LeaveOneOut.find leave_one_out_validation_id end def regression? @@ -269,8 +269,8 @@ module OpenTox end prediction_model[:model_id] = model.id prediction_model[:prediction_feature_id] = prediction_feature.id - prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id - prediction_model[:leave_one_out_validation_id] = LeaveOneOutValidation.create(model).id + prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save prediction_model end -- cgit v1.2.3 From 128fd36b2531756c15a93776871e80eb44e524f1 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 19:01:18 +0200 Subject: proteomics regression validation --- lib/model.rb | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 3482aee..277bca3 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -31,7 +31,7 @@ module OpenTox self.neighbor_algorithm_parameters ||= {} self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id - #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm + send(feature_selection_algorithm.to_sym) if feature_selection_algorithm save end @@ -49,25 +49,31 @@ module OpenTox feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - self.relevant_features[feature_id] = {} - self.relevant_features[feature_id]["pvalue"] = pvalue - self.relevant_features[feature_id]["r"] = r + unless feature_values.uniq.size == 1 + R.assign "feature", feature_values + begin + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + self.relevant_features[feature_id] = {} + self.relevant_features[feature_id]["pvalue"] = pvalue + self.relevant_features[feature_id]["r"] = r + self.relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby + self.relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." end end self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + p self.relevant_features end def predict_substance substance neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols + neighbor_algorithm_parameters[:relevant_features] = self.relevant_features if self.relevant_features neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) measurements = nil prediction = {} -- cgit v1.2.3 From 290c7f86950c4051d018b8019ff4e72ec406c58c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 3 Jun 2016 19:15:36 +0200 Subject: random forest regression --- lib/model.rb | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 277bca3..0432c56 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -3,6 +3,7 @@ module OpenTox module Model class Lazar + include OpenTox include Mongoid::Document include Mongoid::Timestamps @@ -11,11 +12,15 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ field :training_dataset_id, type: BSON::ObjectId - field :prediction_algorithm, type: String field :prediction_feature_id, type: BSON::ObjectId + + field :prediction_algorithm, type: String + field :prediction_algorithm_parameters, type: Hash, default: {} + field :neighbor_algorithm, type: String field :neighbor_algorithm_parameters, type: Hash, default: {} field :feature_selection_algorithm, type: String + field :feature_selection_algorithm_parameters, type: Hash, default: {} field :relevant_features, type: Hash # Create a lazar model from a training_dataset and a feature_dataset @@ -35,7 +40,8 @@ module OpenTox save end - def correlation_filter + def correlation_filter + # TODO: speedup, single assignment of all features to R+ parallel computation of significance? self.relevant_features = {} measurements = [] substances = [] @@ -47,6 +53,7 @@ module OpenTox end R.assign "tox", measurements feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids.select!{|fid| Feature.find(fid).category == feature_selection_algorithm_parameters[:category]} if feature_selection_algorithm_parameters[:category] feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} unless feature_values.uniq.size == 1 @@ -68,7 +75,6 @@ module OpenTox end end self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h - p self.relevant_features end def predict_substance substance @@ -90,14 +96,14 @@ module OpenTox prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) elsif neighbors.size == 1 value = nil - tox = neighbors.first["measurements"] - if tox.size == 1 # single measurement - value = tox.first + m = neighbors.first["measurements"] + if m.size == 1 # single measurement + value = m.first else # multiple measurement - if tox.collect{|t| t.numeric?}.uniq == [true] # numeric - value = tox.median - elsif tox.uniq.size == 1 # single value - value = tox.first + if m.collect{|t| t.numeric?}.uniq == [true] # numeric + value = m.median + elsif m.uniq.size == 1 # single value + value = m.first else # contradictory results # TODO add majority vote?? end @@ -106,7 +112,8 @@ module OpenTox else # call prediction algorithm klass,method = prediction_algorithm.split('.') - result = Object.const_get(klass).send(method,substance,neighbors) + params = prediction_algorithm_parameters.merge({:substance => substance, :neighbors => neighbors}) + result = Object.const_get(klass).send(method,params) prediction.merge! result prediction[:neighbors] = neighbors prediction[:neighbors] ||= [] -- cgit v1.2.3 From 7313c5d26b5f3a672dac0494f16cdf0185f6a39f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 26 Jul 2016 13:21:57 +0200 Subject: NanoPrediction model --- lib/model.rb | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 0432c56..5cf2cdb 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -283,10 +283,45 @@ module OpenTox prediction_model[:model_id] = model.id prediction_model[:prediction_feature_id] = prediction_feature.id prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id + #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save prediction_model end + + end + + class NanoPrediction < Prediction + + def self.from_json_dump dir, category + Import::Enanomapper.import dir + + prediction_model = self.new( + :endpoint => "log2(Net cell association)", + :source => "https://data.enanomapper.net/", + :species => "A549 human lung epithelial carcinoma cells", + :unit => "log2(ug/Mg)" + ) + params = { + :feature_selection_algorithm => :correlation_filter, + :feature_selection_algorithm_parameters => {:category => category}, + :neighbor_algorithm => "physchem_neighbors", + :neighbor_algorithm_parameters => {:min_sim => 0.5}, + :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", + :prediction_algorithm_parameters => {:method => 'rf'}, # random forests + } + training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") + prediction_feature = Feature.find_or_create_by(name: "log2(Net cell association)", category: "TOX") + #prediction_feature = Feature.find("579621b84de73e267b414e55") + prediction_model[:prediction_feature_id] = prediction_feature.id + model = Model::LazarRegression.create(prediction_feature, training_dataset, params) + prediction_model[:model_id] = model.id + repeated_cv = Validation::RepeatedCrossValidation.create model + prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id + prediction_model.save + prediction_model + end + end end -- cgit v1.2.3 From adefea0e78a4f05a2c9537e643873ad61fc22a0a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 3 Oct 2016 19:49:55 +0200 Subject: initial model creation tests --- lib/model.rb | 120 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 64 insertions(+), 56 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 5cf2cdb..749611e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -13,31 +13,73 @@ module OpenTox field :creator, type: String, default: __FILE__ field :training_dataset_id, type: BSON::ObjectId field :prediction_feature_id, type: BSON::ObjectId - - field :prediction_algorithm, type: String - field :prediction_algorithm_parameters, type: Hash, default: {} - - field :neighbor_algorithm, type: String - field :neighbor_algorithm_parameters, type: Hash, default: {} - field :feature_selection_algorithm, type: String - field :feature_selection_algorithm_parameters, type: Hash, default: {} + field :algorithms, type: Hash field :relevant_features, type: Hash - - # Create a lazar model from a training_dataset and a feature_dataset - # @param [OpenTox::Dataset] training_dataset - # @return [OpenTox::Model::Lazar] Regression or classification model - def initialize prediction_feature, training_dataset, params={} - super params + + def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} + bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset + prediction_feature = training_dataset.features.first unless prediction_feature + # TODO: prediction_feature without training_dataset: use all available data + # explicit prediction algorithm + if algorithms[:prediction] and algorithms[:prediction][:method] + case algorithms[:prediction][:method] + when /Classifiction/ + model = LazarClassification.new + when /Regression/ + model = LazarRegression.new + end + # guess model type + elsif prediction_feature.numeric? + model = LazarRegression.new + else + model = LazarClassification.new + end + # set defaults + if model.class == LazarClassification + model.algorithms = { + :similarity => { + :descriptors => "fingerprint['MP2D']", + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :prediction => { + :descriptors => "fingerprint['MP2D']", + :method => "Algorithm::Classification.weighted_majority_vote", + }, + :feature_selection => nil, + } + elsif model.class == LazarRegression + model.algorithms = { + :similarity => { + :descriptors => "fingerprint['MP2D']", + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :prediction => { + :descriptors => "fingerprint['MP2D']", + :method => "Algorithm::Regression.local_caret", + :parameters => "pls", + }, + :feature_selection => nil, + } + end + + # overwrite defaults + algorithms.each do |type,parameters| + parameters.each do |p,v| + model.algorithms[type][p] = v + end if parameters + end # set defaults for empty parameters - self.prediction_feature_id ||= prediction_feature.id - self.training_dataset_id ||= training_dataset.id - self.name ||= "#{training_dataset.name} #{prediction_feature.name}" - self.neighbor_algorithm_parameters ||= {} - self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id - - send(feature_selection_algorithm.to_sym) if feature_selection_algorithm - save + model.prediction_feature_id = prediction_feature.id + model.training_dataset_id = training_dataset.id + model.name = "#{training_dataset.name} #{prediction_feature.name}" + + #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm + model.save + p model + model end def correlation_filter @@ -181,45 +223,11 @@ module OpenTox end class LazarClassification < Lazar - - def self.create prediction_feature, training_dataset, params={} - model = self.new prediction_feature, training_dataset, params - model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm - model.neighbor_algorithm ||= "fingerprint_neighbors" - model.neighbor_algorithm_parameters ||= {} - { - :type => "MP2D", - :dataset_id => training_dataset.id, - :prediction_feature_id => prediction_feature.id, - :min_sim => 0.1 - }.each do |key,value| - model.neighbor_algorithm_parameters[key] ||= value - end - model.save - model - end end class LazarRegression < Lazar - def self.create prediction_feature, training_dataset, params={} - model = self.new prediction_feature, training_dataset, params - model.neighbor_algorithm ||= "fingerprint_neighbors" - model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" - model.neighbor_algorithm_parameters ||= {} - { - :min_sim => 0.1, - :dataset_id => training_dataset.id, - :prediction_feature_id => prediction_feature.id, - }.each do |key,value| - model.neighbor_algorithm_parameters[key] ||= value - end - model.neighbor_algorithm_parameters[:type] ||= "MP2D" if training_dataset.substances.first.is_a? Compound - model.save - model - end - end class Prediction -- cgit v1.2.3 From 5d4e5e463c2b87241bbb56e4658e1e26c0ed084f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 5 Oct 2016 13:22:12 +0200 Subject: substance and nanoparticle model creation and predictions --- lib/model.rb | 135 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 65 insertions(+), 70 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 749611e..a272580 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -28,101 +28,91 @@ module OpenTox when /Regression/ model = LazarRegression.new end + # guess model type elsif prediction_feature.numeric? model = LazarRegression.new else model = LazarClassification.new end + # set defaults - if model.class == LazarClassification + substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq + bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 + + if substance_classes.first == "OpenTox::Compound" + model.algorithms = { + :descriptors => { + :method => "fingerprint", + :type => 'MP2D', + }, :similarity => { - :descriptors => "fingerprint['MP2D']", :method => "Algorithm::Similarity.tanimoto", :min => 0.1 }, - :prediction => { - :descriptors => "fingerprint['MP2D']", - :method => "Algorithm::Classification.weighted_majority_vote", - }, - :feature_selection => nil, + :feature_selection => nil } - elsif model.class == LazarRegression + + if model.class == LazarClassification + model.algorithms[:prediction] = { + :method => "Algorithm::Classification.weighted_majority_vote", + } + elsif model.class == LazarRegression + model.algorithms[:prediction] = { + :method => "Algorithm::Regression.caret", + :parameters => "pls", + } + end + + elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { + :descriptors => { + :method => "properties", + #:types => ["P-CHEM","Proteomics"], + :types => ["P-CHEM"], + }, :similarity => { - :descriptors => "fingerprint['MP2D']", - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 }, :prediction => { - :descriptors => "fingerprint['MP2D']", - :method => "Algorithm::Regression.local_caret", - :parameters => "pls", + :method => "Algorithm::Regression.caret", + :parameters => "rf", + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", }, - :feature_selection => nil, } + else + bad_request_error "Cannot create models for #{substance_classes.first}." end - # overwrite defaults + # overwrite defaults with explicit parameters algorithms.each do |type,parameters| - parameters.each do |p,v| - model.algorithms[type][p] = v - end if parameters + if parameters and parameters.is_a? Hash + parameters.each do |p,v| + model.algorithms[type] ||= {} + model.algorithms[type][p] = v + end + else + model.algorithms[type] = parameters + end end - # set defaults for empty parameters model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id model.name = "#{training_dataset.name} #{prediction_feature.name}" - #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm + if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] + model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] + end model.save - p model model end - def correlation_filter - # TODO: speedup, single assignment of all features to R+ parallel computation of significance? - self.relevant_features = {} - measurements = [] - substances = [] - training_dataset.substances.each do |s| - training_dataset.values(s,prediction_feature_id).each do |act| - measurements << act - substances << s - end - end - R.assign "tox", measurements - feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq - feature_ids.select!{|fid| Feature.find(fid).category == feature_selection_algorithm_parameters[:category]} if feature_selection_algorithm_parameters[:category] - feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} - unless feature_values.uniq.size == 1 - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - self.relevant_features[feature_id] = {} - self.relevant_features[feature_id]["pvalue"] = pvalue - self.relevant_features[feature_id]["r"] = r - self.relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby - self.relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby - end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." - end - end - end - self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h - end - def predict_substance substance - neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols - neighbor_algorithm_parameters[:relevant_features] = self.relevant_features if self.relevant_features - neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) + neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features measurements = nil prediction = {} # handle query substance @@ -153,9 +143,17 @@ module OpenTox prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value else # call prediction algorithm - klass,method = prediction_algorithm.split('.') - params = prediction_algorithm_parameters.merge({:substance => substance, :neighbors => neighbors}) - result = Object.const_get(klass).send(method,params) + case algorithms[:descriptors][:method] + when "fingerprint" + descriptors = substance.fingerprints[algorithms[:descriptors][:type]] + when "properties" + descriptors = substance.properties + else + bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." + end + params = algorithms[:prediction].merge({:descriptors => descriptors, :neighbors => neighbors}) + params.delete :method + result = Algorithm.run algorithms[:prediction][:method], params prediction.merge! result prediction[:neighbors] = neighbors prediction[:neighbors] ||= [] @@ -176,7 +174,7 @@ module OpenTox elsif object.is_a? Dataset substances = object.substances else - bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." end # make predictions @@ -194,7 +192,6 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - #predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id @@ -205,8 +202,6 @@ module OpenTox :prediction_feature_id => prediction_feature.id, :predictions => predictions ) - - #prediction_dataset.save return prediction_dataset end @@ -314,7 +309,7 @@ module OpenTox :feature_selection_algorithm_parameters => {:category => category}, :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}, - :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", + :prediction_algorithm => "OpenTox::Algorithm::Regression.physchem_regression", :prediction_algorithm_parameters => {:method => 'rf'}, # random forests } training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") -- cgit v1.2.3 From 4348eec89033e6677c9f628646fc67bd03c73fe6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 6 Oct 2016 19:14:10 +0200 Subject: nano caret regression fixed --- lib/model.rb | 64 +++++++++++++++++++++++++++--------------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index a272580..290309a 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -23,10 +23,12 @@ module OpenTox # explicit prediction algorithm if algorithms[:prediction] and algorithms[:prediction][:method] case algorithms[:prediction][:method] - when /Classifiction/ + when /Classification/i model = LazarClassification.new - when /Regression/ + when /Regression/i model = LazarRegression.new + else + bad_request_error "Prediction method '#{algorithms[:prediction][:method]}' not implemented." end # guess model type @@ -36,6 +38,10 @@ module OpenTox model = LazarClassification.new end + model.prediction_feature_id = prediction_feature.id + model.training_dataset_id = training_dataset.id + model.name = "#{training_dataset.name} #{prediction_feature.name}" + # set defaults substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 @@ -60,7 +66,7 @@ module OpenTox } elsif model.class == LazarRegression model.algorithms[:prediction] = { - :method => "Algorithm::Regression.caret", + :method => "Algorithm::Caret.regression", :parameters => "pls", } end @@ -77,7 +83,7 @@ module OpenTox :min => 0.5 }, :prediction => { - :method => "Algorithm::Regression.caret", + :method => "Algorithm::Caret.regression", :parameters => "rf", }, :feature_selection => { @@ -100,10 +106,6 @@ module OpenTox end end - model.prediction_feature_id = prediction_feature.id - model.training_dataset_id = training_dataset.id - model.name = "#{training_dataset.name} #{prediction_feature.name}" - if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] end @@ -151,8 +153,12 @@ module OpenTox else bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." end - params = algorithms[:prediction].merge({:descriptors => descriptors, :neighbors => neighbors}) - params.delete :method + params = { + :method => algorithms[:prediction][:parameters], + :descriptors => descriptors, + :neighbors => neighbors, + :relevant_features => relevant_features + } result = Algorithm.run algorithms[:prediction][:method], params prediction.merge! result prediction[:neighbors] = neighbors @@ -218,11 +224,9 @@ module OpenTox end class LazarClassification < Lazar - end class LazarRegression < Lazar - end class Prediction @@ -240,7 +244,7 @@ module OpenTox field :leave_one_out_validation_id, type: BSON::ObjectId def predict object - Lazar.find(model_id).predict object + model.predict object end def training_dataset @@ -251,6 +255,10 @@ module OpenTox Lazar.find model_id end + def prediction_feature + model.prediction_feature + end + def repeated_crossvalidation Validation::RepeatedCrossValidation.find repeated_crossvalidation_id end @@ -276,15 +284,8 @@ module OpenTox bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file prediction_model = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file - prediction_feature = training_dataset.features.first - model = nil - if prediction_feature.nominal? - model = LazarClassification.create prediction_feature, training_dataset - elsif prediction_feature.numeric? - model = LazarRegression.create prediction_feature, training_dataset - end + model = Lazar.create training_dataset: training_dataset prediction_model[:model_id] = model.id - prediction_model[:prediction_feature_id] = prediction_feature.id prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save @@ -297,26 +298,19 @@ module OpenTox def self.from_json_dump dir, category Import::Enanomapper.import dir - + training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset + Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") + training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + end prediction_model = self.new( :endpoint => "log2(Net cell association)", :source => "https://data.enanomapper.net/", :species => "A549 human lung epithelial carcinoma cells", :unit => "log2(ug/Mg)" ) - params = { - :feature_selection_algorithm => :correlation_filter, - :feature_selection_algorithm_parameters => {:category => category}, - :neighbor_algorithm => "physchem_neighbors", - :neighbor_algorithm_parameters => {:min_sim => 0.5}, - :prediction_algorithm => "OpenTox::Algorithm::Regression.physchem_regression", - :prediction_algorithm_parameters => {:method => 'rf'}, # random forests - } - training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - prediction_feature = Feature.find_or_create_by(name: "log2(Net cell association)", category: "TOX") - #prediction_feature = Feature.find("579621b84de73e267b414e55") - prediction_model[:prediction_feature_id] = prediction_feature.id - model = Model::LazarRegression.create(prediction_feature, training_dataset, params) + prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first + model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset) prediction_model[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id -- cgit v1.2.3 From dc4ab1f4e64d738d6c0b70f0b690a2359685080f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 12 Oct 2016 21:32:27 +0200 Subject: physchem regression, correlation_filter for fingerprints --- lib/model.rb | 197 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 137 insertions(+), 60 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 290309a..f3f0603 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -11,10 +11,18 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ + field :algorithms, type: Hash, default:{} field :training_dataset_id, type: BSON::ObjectId + field :substance_ids, type: Array, default:[] field :prediction_feature_id, type: BSON::ObjectId - field :algorithms, type: Hash - field :relevant_features, type: Hash + field :dependent_variables, type: Array, default:[] + field :descriptor_ids, type:Array, default:[] + field :independent_variables, type: Array, default:[] + field :fingerprints, type: Array, default:[] + field :descriptor_weights, type: Array, default:[] + field :descriptor_means, type: Array, default:[] + field :descriptor_sds, type: Array, default:[] + field :scaled_variables, type: Array, default:[] def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset @@ -40,7 +48,7 @@ module OpenTox model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id - model.name = "#{training_dataset.name} #{prediction_feature.name}" + model.name = "#{prediction_feature.name} (#{training_dataset.name})" # set defaults substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq @@ -49,10 +57,7 @@ module OpenTox if substance_classes.first == "OpenTox::Compound" model.algorithms = { - :descriptors => { - :method => "fingerprint", - :type => 'MP2D', - }, + :descriptors => ['MP2D'], :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.1 @@ -66,25 +71,20 @@ module OpenTox } elsif model.class == LazarRegression model.algorithms[:prediction] = { - :method => "Algorithm::Caret.regression", - :parameters => "pls", + :method => "Algorithm::Caret.pls", } end elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { - :descriptors => { - :method => "properties", - #:types => ["P-CHEM","Proteomics"], - :types => ["P-CHEM"], - }, + :descriptors => ["P-CHEM"], + #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", :min => 0.5 }, :prediction => { - :method => "Algorithm::Caret.regression", - :parameters => "rf", + :method => "Algorithm::Caret.rf", }, :feature_selection => { :method => "Algorithm::FeatureSelection.correlation_filter", @@ -106,63 +106,128 @@ module OpenTox end end + # parse dependent_variables from training dataset + training_dataset.substances.each do |substance| + values = training_dataset.values(substance,model.prediction_feature_id) + values.each do |v| + model.substance_ids << substance.id.to_s + model.dependent_variables << v + end if values + end + + # parse fingerprints + if model.fingerprints? + model.algorithms[:descriptors].each do |type| + model.substances.each_with_index do |s,i| + model.fingerprints[i] ||= [] + model.fingerprints[i] += s.fingerprint(type) + model.fingerprints[i].uniq! + end + end + model.descriptor_ids = model.fingerprints.flatten.uniq + model.descriptor_ids.each do |d| + model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} + end + else + # parse independent_variables + if (model.algorithms[:descriptors] & ["PhysChem::OPENBABEL","PhysChem::CDK","PhysChem::JOELIB"]).empty? + properties = model.substances.collect { |s| s.properties } + all_property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} + + # calculate physchem properties + else + properties = model.substances.collect { |s| s.calculated_properties(model.algorithms[:descriptors]) } + model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}} + end + end + if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] - model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] + model = Algorithm.run model.algorithms[:feature_selection][:method], model + end + + # scale independent_variables + unless model.fingerprints? + model.independent_variables.each_with_index do |var,i| + model.descriptor_means[i] = var.mean + model.descriptor_sds[i] = var.standard_deviation + model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil} + end end model.save model end def predict_substance substance - neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features - measurements = nil - prediction = {} - # handle query substance - if neighbors.collect{|n| n["_id"]}.include? substance.id - - query = neighbors.select{|n| n["_id"] == substance.id}.first - measurements = training_dataset.values(query["_id"],prediction_feature_id) - prediction[:measurements] = measurements - prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance." - neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) + + case algorithms[:similarity][:method] + when /tanimoto/ # binary features + similarity_descriptors = algorithms[:descriptors].collect{|type| substance.fingerprint(type)}.flatten.uniq + # TODO this excludes descriptors only present in the query substance + query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} + when /euclid|cosine/ # quantitative features + similarity_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + (prop-descriptor_means[i])/descriptor_sds[i] + } + query_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + substance.properties[id] + } + else + bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end - if neighbors.empty? - prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) - elsif neighbors.size == 1 - value = nil - m = neighbors.first["measurements"] - if m.size == 1 # single measurement - value = m.first - else # multiple measurement - if m.collect{|t| t.numeric?}.uniq == [true] # numeric - value = m.median - elsif m.uniq.size == 1 # single value - value = m.first - else # contradictory results - # TODO add majority vote?? + + prediction = {} + neighbor_ids = [] + neighbor_similarities = [] + neighbor_dependent_variables = [] + neighbor_independent_variables = [] + + prediction = {} + # find neighbors + substance_ids.each_with_index do |s,i| + # handle query substance + if substance.id.to_s == s + prediction[:measurements] ||= [] + prediction[:measurements] << dependent_variables[i] + prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." + else + next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core + if fingerprints? + neighbor_descriptors = fingerprints[i] + else + neighbor_descriptors = scaled_variables.collect{|v| v[i]} + end + sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] + if sim > algorithms[:similarity][:min] + neighbor_ids << s + neighbor_similarities << sim + neighbor_dependent_variables << dependent_variables[i] + independent_variables.each_with_index do |c,j| + neighbor_independent_variables[j] ||= [] + neighbor_independent_variables[j] << independent_variables[j][i] + end end end - prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value + end + + measurements = nil + + if neighbor_similarities.empty? + prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + elsif neighbor_similarities.size == 1 + prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else # call prediction algorithm - case algorithms[:descriptors][:method] - when "fingerprint" - descriptors = substance.fingerprints[algorithms[:descriptors][:type]] - when "properties" - descriptors = substance.properties - else - bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." - end - params = { - :method => algorithms[:prediction][:parameters], - :descriptors => descriptors, - :neighbors => neighbors, - :relevant_features => relevant_features - } - result = Algorithm.run algorithms[:prediction][:method], params + result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors + p result prediction.merge! result - prediction[:neighbors] = neighbors - prediction[:neighbors] ||= [] + prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end prediction end @@ -221,6 +286,18 @@ module OpenTox Feature.find(prediction_feature_id) end + def descriptors + descriptor_ids.collect{|id| Feature.find(id)} + end + + def substances + substance_ids.collect{|id| Substance.find(id)} + end + + def fingerprints? + algorithms[:similarity][:method].match("tanimoto") ? true : false + end + end class LazarClassification < Lazar -- cgit v1.2.3 From 8d325866dd7cacdd04bd2306a9144a5e7300c7c8 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 10:11:09 +0200 Subject: molecular_weight fixed --- lib/model.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index f3f0603..859df8b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -126,7 +126,8 @@ module OpenTox end model.descriptor_ids = model.fingerprints.flatten.uniq model.descriptor_ids.each do |d| - model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} + # resulting model may break BSON size limit (e.g. f Kazius dataset + model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/ end else # parse independent_variables @@ -225,7 +226,6 @@ module OpenTox else # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors - p result prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end -- cgit v1.2.3 From 9e99495ecbff147218023c136bade9e56a502fed Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 14:39:04 +0200 Subject: descriptor tests fixed --- lib/model.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 859df8b..7029c31 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -131,7 +131,7 @@ module OpenTox end else # parse independent_variables - if (model.algorithms[:descriptors] & ["PhysChem::OPENBABEL","PhysChem::CDK","PhysChem::JOELIB"]).empty? + if (model.algorithms[:descriptors] & [PhysChem::OPENBABEL,PhysChem::CDK,PhysChem::JOELIB]).empty? properties = model.substances.collect { |s| s.properties } all_property_ids = properties.collect{|p| p.keys}.flatten.uniq model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } @@ -139,7 +139,7 @@ module OpenTox # calculate physchem properties else - properties = model.substances.collect { |s| s.calculated_properties(model.algorithms[:descriptors]) } + properties = model.substances.collect { |s| s.calculate_properties(model.algorithms[:descriptors]) } model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}} end -- cgit v1.2.3 From ad7ec6a1e33f69557fe64371581d5f42a65ecaa8 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 17:34:31 +0200 Subject: classification fixed --- lib/model.rb | 63 ++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 25 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 7029c31..b949042 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -57,7 +57,10 @@ module OpenTox if substance_classes.first == "OpenTox::Compound" model.algorithms = { - :descriptors => ['MP2D'], + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.1 @@ -77,7 +80,10 @@ module OpenTox elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { - :descriptors => ["P-CHEM"], + :descriptors => { + :method => "properties", + :category => "P-CHEM", + }, #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", @@ -115,34 +121,41 @@ module OpenTox end if values end + descriptor_method = model.algorithms[:descriptors][:method] + case descriptor_method # parse fingerprints - if model.fingerprints? - model.algorithms[:descriptors].each do |type| - model.substances.each_with_index do |s,i| - model.fingerprints[i] ||= [] - model.fingerprints[i] += s.fingerprint(type) - model.fingerprints[i].uniq! - end + when "fingerprint" + type = model.algorithms[:descriptors][:type] + model.substances.each_with_index do |s,i| + model.fingerprints[i] ||= [] + model.fingerprints[i] += s.fingerprint(type) + model.fingerprints[i].uniq! end model.descriptor_ids = model.fingerprints.flatten.uniq model.descriptor_ids.each do |d| - # resulting model may break BSON size limit (e.g. f Kazius dataset + # resulting model may break BSON size limit (e.g. f Kazius dataset) model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/ end - else - # parse independent_variables - if (model.algorithms[:descriptors] & [PhysChem::OPENBABEL,PhysChem::CDK,PhysChem::JOELIB]).empty? - properties = model.substances.collect { |s| s.properties } - all_property_ids = properties.collect{|p| p.keys}.flatten.uniq - model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } - model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} - - # calculate physchem properties - else - properties = model.substances.collect { |s| s.calculate_properties(model.algorithms[:descriptors]) } - model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq - model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}} + # calculate physchem properties + when "calculate_properties" + features = model.algorithms[:descriptors][:features] + model.descriptor_ids = features.collect{|f| f.id.to_s} + model.algorithms[:descriptors].delete(:features) + model.algorithms[:descriptors].delete(:type) + model.substances.each_with_index do |s,i| + s.calculate_properties(features).each_with_index do |v,j| + model.independent_variables[j] ||= [] + model.independent_variables[j][i] = v + end end + # parse independent_variables + when "properties" + properties = model.substances.collect { |s| s.properties } + all_property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} + else + bad_request_error "Descriptor method '#{descriptor_method}' not implemented." end if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] @@ -165,7 +178,7 @@ module OpenTox case algorithms[:similarity][:method] when /tanimoto/ # binary features - similarity_descriptors = algorithms[:descriptors].collect{|type| substance.fingerprint(type)}.flatten.uniq + similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] # TODO this excludes descriptors only present in the query substance query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} when /euclid|cosine/ # quantitative features @@ -295,7 +308,7 @@ module OpenTox end def fingerprints? - algorithms[:similarity][:method].match("tanimoto") ? true : false + algorithms[:descriptors][:method] == "fingerprint" ? true : false end end -- cgit v1.2.3 From 160e75e696452ac61e651664ac56d16ce1c9c4b6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 19:17:03 +0200 Subject: model tests separated and cleaned --- lib/model.rb | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index b949042..4bbb7da 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -82,7 +82,7 @@ module OpenTox model.algorithms = { :descriptors => { :method => "properties", - :category => "P-CHEM", + :categories => ["P-CHEM"], }, #:descriptors => ["P-CHEM","Proteomics"], :similarity => { @@ -150,9 +150,14 @@ module OpenTox end # parse independent_variables when "properties" + categories = model.algorithms[:descriptors][:categories] + feature_ids = [] + categories.each do |category| + Feature.where(category:category).each{|f| feature_ids << f.id.to_s} + end properties = model.substances.collect { |s| s.properties } - all_property_ids = properties.collect{|p| p.keys}.flatten.uniq - model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } + property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} else bad_request_error "Descriptor method '#{descriptor_method}' not implemented." @@ -180,18 +185,25 @@ module OpenTox when /tanimoto/ # binary features similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] # TODO this excludes descriptors only present in the query substance + # use for applicability domain? query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} when /euclid|cosine/ # quantitative features - similarity_descriptors = descriptor_ids.collect_with_index{|id,i| - prop = substance.properties[id] - prop = prop.median if prop.is_a? Array # measured - (prop-descriptor_means[i])/descriptor_sds[i] - } - query_descriptors = descriptor_ids.collect_with_index{|id,i| - prop = substance.properties[id] - prop = prop.median if prop.is_a? Array # measured - substance.properties[id] - } + if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors + features = descriptor_ids.collect{|id| Feature.find(id)} + query_descriptors = substance.calculate_properties(features) + similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]} + else + similarity_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + (prop-descriptor_means[i])/descriptor_sds[i] + } + query_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + substance.properties[id] + } + end else bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end @@ -218,7 +230,7 @@ module OpenTox neighbor_descriptors = scaled_variables.collect{|v| v[i]} end sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] - if sim > algorithms[:similarity][:min] + if sim >= algorithms[:similarity][:min] neighbor_ids << s neighbor_similarities << sim neighbor_dependent_variables << dependent_variables[i] -- cgit v1.2.3 From 09452bba5c407c27721223d126e3f45c12b20a0c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 22:59:45 +0200 Subject: tests pass --- lib/model.rb | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 4bbb7da..d7b072f 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -28,23 +28,9 @@ module OpenTox bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset prediction_feature = training_dataset.features.first unless prediction_feature # TODO: prediction_feature without training_dataset: use all available data - # explicit prediction algorithm - if algorithms[:prediction] and algorithms[:prediction][:method] - case algorithms[:prediction][:method] - when /Classification/i - model = LazarClassification.new - when /Regression/i - model = LazarRegression.new - else - bad_request_error "Prediction method '#{algorithms[:prediction][:method]}' not implemented." - end # guess model type - elsif prediction_feature.numeric? - model = LazarRegression.new - else - model = LazarClassification.new - end + prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id @@ -193,17 +179,17 @@ module OpenTox query_descriptors = substance.calculate_properties(features) similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]} else - similarity_descriptors = descriptor_ids.collect_with_index{|id,i| - prop = substance.properties[id] - prop = prop.median if prop.is_a? Array # measured - (prop-descriptor_means[i])/descriptor_sds[i] - } - query_descriptors = descriptor_ids.collect_with_index{|id,i| + similarity_descriptors = [] + query_descriptors = [] + descriptor_ids.each_with_index do |id,i| prop = substance.properties[id] prop = prop.median if prop.is_a? Array # measured - substance.properties[id] - } + if prop + similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i] + query_descriptors[i] = prop + end end + end else bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end -- cgit v1.2.3 From fbded88db8b51f41ffbd5a02f601e4538ec87258 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 14 Oct 2016 09:55:51 +0200 Subject: git commit added to model metadata --- lib/model.rb | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index d7b072f..7503215 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -23,6 +23,7 @@ module OpenTox field :descriptor_means, type: Array, default:[] field :descriptor_sds, type: Array, default:[] field :scaled_variables, type: Array, default:[] + field :version, type: Hash, default:{} def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset @@ -35,6 +36,16 @@ module OpenTox model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id model.name = "#{prediction_feature.name} (#{training_dataset.name})" + # TODO: check if this works for gem version, add gem versioning? + dir = File.dirname(__FILE__) + commit = `cd #{dir}; git rev-parse HEAD`.chomp + branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp + url = `cd #{dir}; git config --get remote.origin.url`.chomp + if branch + model.version = {:url => url, :branch => branch, :commit => commit} + else + model.version = {:warning => "git is not installed"} + end # set defaults substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq -- cgit v1.2.3 From 295dcfc74e1375e495ec3d9c1e74a402eb4decd4 Mon Sep 17 00:00:00 2001 From: gebele Date: Thu, 10 Nov 2016 11:06:27 +0000 Subject: added nanomodel create --- lib/model.rb | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 7503215..adcbcc6 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -418,6 +418,28 @@ module OpenTox prediction_model end + def self.create dir: dir, algorithms: algorithms + training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset + Import::Enanomapper.import dir + training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + end + prediction_model = self.new( + :endpoint => "log2(Net cell association)", + :source => "https://data.enanomapper.net/", + :species => "A549 human lung epithelial carcinoma cells", + :unit => "log2(ug/Mg)" + ) + prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first + model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) + prediction_model[:model_id] = model.id + repeated_cv = Validation::RepeatedCrossValidation.create model + prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id + prediction_model.save + prediction_model + end + end end -- cgit v1.2.3 From 9e7b36613e98601de7b2ceb2d4442e11f1ae868a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 10 Nov 2016 12:23:46 +0100 Subject: intermediate commit, may be defunct --- lib/model.rb | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 7503215..6a5e614 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -81,7 +81,6 @@ module OpenTox :method => "properties", :categories => ["P-CHEM"], }, - #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", :min => 0.5 @@ -140,10 +139,11 @@ module OpenTox model.algorithms[:descriptors].delete(:features) model.algorithms[:descriptors].delete(:type) model.substances.each_with_index do |s,i| - s.calculate_properties(features).each_with_index do |v,j| + props = s.calculate_properties(features) + props.each_with_index do |v,j| model.independent_variables[j] ||= [] model.independent_variables[j][i] = v - end + end if props and !props.empty? end # parse independent_variables when "properties" @@ -152,7 +152,10 @@ module OpenTox categories.each do |category| Feature.where(category:category).each{|f| feature_ids << f.id.to_s} end - properties = model.substances.collect { |s| s.properties } + #p feature_ids + #properties = Nanoparticle.all.collect { |s| p s.name; p s.id; p s.properties } + properties = model.substances.collect { |s| s.properties } + #p properties property_ids = properties.collect{|p| p.keys}.flatten.uniq model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} -- cgit v1.2.3 From 9a06f2ff5ae6bdbe7dc90555599e186f1585e0d2 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 10 Nov 2016 15:27:26 +0100 Subject: Model::NanoPrediction parameters --- lib/model.rb | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 549cbd2..809dc48 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -106,7 +106,7 @@ module OpenTox else model.algorithms[type] = parameters end - end + end if algorithms # parse dependent_variables from training dataset training_dataset.substances.each do |substance| @@ -249,6 +249,7 @@ module OpenTox elsif neighbor_similarities.size == 1 prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else + query_descriptors.collect!{|d| d ? 1 : 0} if independent_variables[0][0].numeric? # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result @@ -343,7 +344,7 @@ module OpenTox field :unit, type: String field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId - field :leave_one_out_validation_id, type: BSON::ObjectId + #field :leave_one_out_validation_id, type: BSON::ObjectId def predict object model.predict object @@ -398,42 +399,28 @@ module OpenTox class NanoPrediction < Prediction - def self.from_json_dump dir, category - Import::Enanomapper.import dir - training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") + def self.create training_dataset: nil, prediction_feature:nil, algorithms: nil + + # find/import training_dataset + training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset # try to import from json dump + Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset + Import::Enanomapper.mirror + Import::Enanomapper.import + training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset + end end - prediction_model = self.new( - :endpoint => "log2(Net cell association)", - :source => "https://data.enanomapper.net/", - :species => "A549 human lung epithelial carcinoma cells", - :unit => "log2(ug/Mg)" - ) - prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first - model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset) - prediction_model[:model_id] = model.id - repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id - prediction_model.save - prediction_model - end + prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first - def self.create dir: dir, algorithms: algorithms - training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.import dir - training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - end prediction_model = self.new( - :endpoint => "log2(Net cell association)", - :source => "https://data.enanomapper.net/", + :endpoint => prediction_feature.name, + :source => prediction_feature.source, :species => "A549 human lung epithelial carcinoma cells", - :unit => "log2(ug/Mg)" + :unit => prediction_feature.unit ) - prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) prediction_model[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model -- cgit v1.2.3 From b6116bc4705066da30668ff3370f3b1c307e44e7 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 11 Nov 2016 13:07:53 +0100 Subject: enm import fixed --- lib/model.rb | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 809dc48..9be0fa0 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -152,10 +152,7 @@ module OpenTox categories.each do |category| Feature.where(category:category).each{|f| feature_ids << f.id.to_s} end - #p feature_ids - #properties = Nanoparticle.all.collect { |s| p s.name; p s.id; p s.properties } properties = model.substances.collect { |s| s.properties } - #p properties property_ids = properties.collect{|p| p.keys}.flatten.uniq model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} @@ -223,10 +220,10 @@ module OpenTox prediction[:measurements] << dependent_variables[i] prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else - next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core if fingerprints? neighbor_descriptors = fingerprints[i] else + next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions neighbor_descriptors = scaled_variables.collect{|v| v[i]} end sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] @@ -344,7 +341,6 @@ module OpenTox field :unit, type: String field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId - #field :leave_one_out_validation_id, type: BSON::ObjectId def predict object model.predict object @@ -370,10 +366,6 @@ module OpenTox repeated_crossvalidation.crossvalidations end - def leave_one_out_validation - Validation::LeaveOneOut.find leave_one_out_validation_id - end - def regression? model.is_a? LazarRegression end @@ -390,7 +382,6 @@ module OpenTox model = Lazar.create training_dataset: training_dataset prediction_model[:model_id] = model.id prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save prediction_model end @@ -406,12 +397,7 @@ module OpenTox unless training_dataset # try to import from json dump Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.mirror - Import::Enanomapper.import - training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset - end + bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset end prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first @@ -424,8 +410,7 @@ module OpenTox model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) prediction_model[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id + prediction_model[:repeated_crossvalidation_id] = repeated_cv.id prediction_model.save prediction_model end -- cgit v1.2.3 From 99c42f76b02f9084d0757eb0c52b4a55fa295a95 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 11 Nov 2016 17:19:13 +0100 Subject: p-chem regression and enm import fixed --- lib/model.rb | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 9be0fa0..9ed3210 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -102,6 +102,7 @@ module OpenTox parameters.each do |p,v| model.algorithms[type] ||= {} model.algorithms[type][p] = v + model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type end else model.algorithms[type] = parameters @@ -246,7 +247,7 @@ module OpenTox elsif neighbor_similarities.size == 1 prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else - query_descriptors.collect!{|d| d ? 1 : 0} if independent_variables[0][0].numeric? + query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result @@ -329,7 +330,7 @@ module OpenTox class LazarRegression < Lazar end - class Prediction + class Validation include OpenTox include Mongoid::Document @@ -377,20 +378,16 @@ module OpenTox def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file - prediction_model = self.new JSON.parse(File.read(metadata_file)) + model_validation = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file model = Lazar.create training_dataset: training_dataset - prediction_model[:model_id] = model.id - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - prediction_model.save - prediction_model + model_validation[:model_id] = model.id + model_validation[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + model_validation.save + model_validation end - end - - class NanoPrediction < Prediction - - def self.create training_dataset: nil, prediction_feature:nil, algorithms: nil + def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil # find/import training_dataset training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first @@ -401,18 +398,18 @@ module OpenTox end prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first - prediction_model = self.new( + model_validation = self.new( :endpoint => prediction_feature.name, :source => prediction_feature.source, :species => "A549 human lung epithelial carcinoma cells", :unit => prediction_feature.unit ) model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) - prediction_model[:model_id] = model.id + model_validation[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = repeated_cv.id - prediction_model.save - prediction_model + model_validation[:repeated_crossvalidation_id] = repeated_cv.id + model_validation.save + model_validation end end -- cgit v1.2.3 From 2baffb4a3ebfa2b4a32c0c148bf61a5da89ec210 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 25 Nov 2016 10:36:02 +0100 Subject: algorithms accessor for Model::Validation --- lib/model.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 9ed3210..e8b30ca 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -355,6 +355,10 @@ module OpenTox Lazar.find model_id end + def algorithms + model.algorithms + end + def prediction_feature model.prediction_feature end @@ -404,7 +408,7 @@ module OpenTox :species => "A549 human lung epithelial carcinoma cells", :unit => prediction_feature.unit ) - model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) + model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms model_validation[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model model_validation[:repeated_crossvalidation_id] = repeated_cv.id -- cgit v1.2.3 From 4570f11444bc10da88d849e9a2812e95a8933c8a Mon Sep 17 00:00:00 2001 From: gebele Date: Tue, 6 Dec 2016 09:59:24 +0000 Subject: full class name required --- lib/model.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index e8b30ca..38c1915 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -364,7 +364,8 @@ module OpenTox end def repeated_crossvalidation - Validation::RepeatedCrossValidation.find repeated_crossvalidation_id + # full class name required + OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id end def crossvalidations @@ -386,7 +387,8 @@ module OpenTox training_dataset = Dataset.from_csv_file file model = Lazar.create training_dataset: training_dataset model_validation[:model_id] = model.id - model_validation[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + # full class name required + model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id model_validation.save model_validation end -- cgit v1.2.3