From dc4ab1f4e64d738d6c0b70f0b690a2359685080f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 12 Oct 2016 21:32:27 +0200 Subject: physchem regression, correlation_filter for fingerprints --- lib/model.rb | 197 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 137 insertions(+), 60 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 290309a..f3f0603 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -11,10 +11,18 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ + field :algorithms, type: Hash, default:{} field :training_dataset_id, type: BSON::ObjectId + field :substance_ids, type: Array, default:[] field :prediction_feature_id, type: BSON::ObjectId - field :algorithms, type: Hash - field :relevant_features, type: Hash + field :dependent_variables, type: Array, default:[] + field :descriptor_ids, type:Array, default:[] + field :independent_variables, type: Array, default:[] + field :fingerprints, type: Array, default:[] + field :descriptor_weights, type: Array, default:[] + field :descriptor_means, type: Array, default:[] + field :descriptor_sds, type: Array, default:[] + field :scaled_variables, type: Array, default:[] def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset @@ -40,7 +48,7 @@ module OpenTox model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id - model.name = "#{training_dataset.name} #{prediction_feature.name}" + model.name = "#{prediction_feature.name} (#{training_dataset.name})" # set defaults substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq @@ -49,10 +57,7 @@ module OpenTox if substance_classes.first == "OpenTox::Compound" model.algorithms = { - :descriptors => { - :method => "fingerprint", - :type => 'MP2D', - }, + :descriptors => ['MP2D'], :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.1 @@ -66,25 +71,20 @@ module OpenTox } elsif model.class == LazarRegression model.algorithms[:prediction] = { - :method => "Algorithm::Caret.regression", - :parameters => "pls", + :method => "Algorithm::Caret.pls", } end elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { - :descriptors => { - :method => "properties", - #:types => ["P-CHEM","Proteomics"], - :types => ["P-CHEM"], - }, + :descriptors => ["P-CHEM"], + #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", :min => 0.5 }, :prediction => { - :method => "Algorithm::Caret.regression", - :parameters => "rf", + :method => "Algorithm::Caret.rf", }, :feature_selection => { :method => "Algorithm::FeatureSelection.correlation_filter", @@ -106,63 +106,128 @@ module OpenTox end end + # parse dependent_variables from training dataset + training_dataset.substances.each do |substance| + values = training_dataset.values(substance,model.prediction_feature_id) + values.each do |v| + model.substance_ids << substance.id.to_s + model.dependent_variables << v + end if values + end + + # parse fingerprints + if model.fingerprints? + model.algorithms[:descriptors].each do |type| + model.substances.each_with_index do |s,i| + model.fingerprints[i] ||= [] + model.fingerprints[i] += s.fingerprint(type) + model.fingerprints[i].uniq! + end + end + model.descriptor_ids = model.fingerprints.flatten.uniq + model.descriptor_ids.each do |d| + model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} + end + else + # parse independent_variables + if (model.algorithms[:descriptors] & ["PhysChem::OPENBABEL","PhysChem::CDK","PhysChem::JOELIB"]).empty? + properties = model.substances.collect { |s| s.properties } + all_property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} + + # calculate physchem properties + else + properties = model.substances.collect { |s| s.calculated_properties(model.algorithms[:descriptors]) } + model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}} + end + end + if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] - model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] + model = Algorithm.run model.algorithms[:feature_selection][:method], model + end + + # scale independent_variables + unless model.fingerprints? + model.independent_variables.each_with_index do |var,i| + model.descriptor_means[i] = var.mean + model.descriptor_sds[i] = var.standard_deviation + model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil} + end end model.save model end def predict_substance substance - neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features - measurements = nil - prediction = {} - # handle query substance - if neighbors.collect{|n| n["_id"]}.include? substance.id - - query = neighbors.select{|n| n["_id"] == substance.id}.first - measurements = training_dataset.values(query["_id"],prediction_feature_id) - prediction[:measurements] = measurements - prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance." - neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) + + case algorithms[:similarity][:method] + when /tanimoto/ # binary features + similarity_descriptors = algorithms[:descriptors].collect{|type| substance.fingerprint(type)}.flatten.uniq + # TODO this excludes descriptors only present in the query substance + query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} + when /euclid|cosine/ # quantitative features + similarity_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + (prop-descriptor_means[i])/descriptor_sds[i] + } + query_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + substance.properties[id] + } + else + bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end - if neighbors.empty? - prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) - elsif neighbors.size == 1 - value = nil - m = neighbors.first["measurements"] - if m.size == 1 # single measurement - value = m.first - else # multiple measurement - if m.collect{|t| t.numeric?}.uniq == [true] # numeric - value = m.median - elsif m.uniq.size == 1 # single value - value = m.first - else # contradictory results - # TODO add majority vote?? + + prediction = {} + neighbor_ids = [] + neighbor_similarities = [] + neighbor_dependent_variables = [] + neighbor_independent_variables = [] + + prediction = {} + # find neighbors + substance_ids.each_with_index do |s,i| + # handle query substance + if substance.id.to_s == s + prediction[:measurements] ||= [] + prediction[:measurements] << dependent_variables[i] + prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." + else + next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core + if fingerprints? + neighbor_descriptors = fingerprints[i] + else + neighbor_descriptors = scaled_variables.collect{|v| v[i]} + end + sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] + if sim > algorithms[:similarity][:min] + neighbor_ids << s + neighbor_similarities << sim + neighbor_dependent_variables << dependent_variables[i] + independent_variables.each_with_index do |c,j| + neighbor_independent_variables[j] ||= [] + neighbor_independent_variables[j] << independent_variables[j][i] + end end end - prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value + end + + measurements = nil + + if neighbor_similarities.empty? + prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + elsif neighbor_similarities.size == 1 + prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else # call prediction algorithm - case algorithms[:descriptors][:method] - when "fingerprint" - descriptors = substance.fingerprints[algorithms[:descriptors][:type]] - when "properties" - descriptors = substance.properties - else - bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." - end - params = { - :method => algorithms[:prediction][:parameters], - :descriptors => descriptors, - :neighbors => neighbors, - :relevant_features => relevant_features - } - result = Algorithm.run algorithms[:prediction][:method], params + result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors + p result prediction.merge! result - prediction[:neighbors] = neighbors - prediction[:neighbors] ||= [] + prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end prediction end @@ -221,6 +286,18 @@ module OpenTox Feature.find(prediction_feature_id) end + def descriptors + descriptor_ids.collect{|id| Feature.find(id)} + end + + def substances + substance_ids.collect{|id| Substance.find(id)} + end + + def fingerprints? + algorithms[:similarity][:method].match("tanimoto") ? true : false + end + end class LazarClassification < Lazar -- cgit v1.2.3