summary refs log tree commit diff
path: root/lib/model.rb
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2016-10-12 21:32:27 +0200
committer Christoph Helma <helma@in-silico.ch> 2016-10-12 21:32:27 +0200
commit dc4ab1f4e64d738d6c0b70f0b690a2359685080f (patch)
tree 054ae887bf978b519a95dce5dbead59bbc67a2bb /lib/model.rb
parent 1ec5ad2c67f270287499980a794e51bc9a6bbd84 (diff)
physchem regression, correlation_filter for fingerprints
Diffstat (limited to 'lib/model.rb')
-rw-r--r-- lib/model.rb 197
1 files changed, 137 insertions, 60 deletions
diff --git a/lib/model.rb b/lib/model.rb
index 290309a..f3f0603 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -11,10 +11,18 @@ module OpenTox
field :name, type: String
field :creator, type: String, default: __FILE__
+ field :algorithms, type: Hash, default:{}
field :training_dataset_id, type: BSON::ObjectId
+ field :substance_ids, type: Array, default:[]
field :prediction_feature_id, type: BSON::ObjectId
- field :algorithms, type: Hash
- field :relevant_features, type: Hash
+ field :dependent_variables, type: Array, default:[]
+ field :descriptor_ids, type:Array, default:[]
+ field :independent_variables, type: Array, default:[]
+ field :fingerprints, type: Array, default:[]
+ field :descriptor_weights, type: Array, default:[]
+ field :descriptor_means, type: Array, default:[]
+ field :descriptor_sds, type: Array, default:[]
+ field :scaled_variables, type: Array, default:[]
def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
@@ -40,7 +48,7 @@ module OpenTox
model.prediction_feature_id = prediction_feature.id
model.training_dataset_id = training_dataset.id
- model.name = "#{training_dataset.name} #{prediction_feature.name}"
+ model.name = "#{prediction_feature.name} (#{training_dataset.name})"
# set defaults
substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
@@ -49,10 +57,7 @@ module OpenTox
if substance_classes.first == "OpenTox::Compound"
model.algorithms = {
- :descriptors => {
- :method => "fingerprint",
- :type => 'MP2D',
- },
+ :descriptors => ['MP2D'],
:similarity => {
:method => "Algorithm::Similarity.tanimoto",
:min => 0.1
@@ -66,25 +71,20 @@ module OpenTox
}
elsif model.class == LazarRegression
model.algorithms[:prediction] = {
- :method => "Algorithm::Caret.regression",
- :parameters => "pls",
+ :method => "Algorithm::Caret.pls",
}
end
elsif substance_classes.first == "OpenTox::Nanoparticle"
model.algorithms = {
- :descriptors => {
- :method => "properties",
- #:types => ["P-CHEM","Proteomics"],
- :types => ["P-CHEM"],
- },
+ :descriptors => ["P-CHEM"],
+ #:descriptors => ["P-CHEM","Proteomics"],
:similarity => {
:method => "Algorithm::Similarity.weighted_cosine",
:min => 0.5
},
:prediction => {
- :method => "Algorithm::Caret.regression",
- :parameters => "rf",
+ :method => "Algorithm::Caret.rf",
},
:feature_selection => {
:method => "Algorithm::FeatureSelection.correlation_filter",
@@ -106,63 +106,128 @@ module OpenTox
end
end
+ # parse dependent_variables from training dataset
+ training_dataset.substances.each do |substance|
+ values = training_dataset.values(substance,model.prediction_feature_id)
+ values.each do |v|
+ model.substance_ids << substance.id.to_s
+ model.dependent_variables << v
+ end if values
+ end
+
+ # parse fingerprints
+ if model.fingerprints?
+ model.algorithms[:descriptors].each do |type|
+ model.substances.each_with_index do |s,i|
+ model.fingerprints[i] ||= []
+ model.fingerprints[i] += s.fingerprint(type)
+ model.fingerprints[i].uniq!
+ end
+ end
+ model.descriptor_ids = model.fingerprints.flatten.uniq
+ model.descriptor_ids.each do |d|
+ model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d}
+ end
+ else
+ # parse independent_variables
+ if (model.algorithms[:descriptors] & ["PhysChem::OPENBABEL","PhysChem::CDK","PhysChem::JOELIB"]).empty?
+ properties = model.substances.collect { |s| s.properties }
+ all_property_ids = properties.collect{|p| p.keys}.flatten.uniq
+ model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category }
+ model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
+
+ # calculate physchem properties
+ else
+ properties = model.substances.collect { |s| s.calculated_properties(model.algorithms[:descriptors]) }
+ model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq
+ model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}}
+ end
+ end
+
if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
- model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types]
+ model = Algorithm.run model.algorithms[:feature_selection][:method], model
+ end
+
+ # scale independent_variables
+ unless model.fingerprints?
+ model.independent_variables.each_with_index do |var,i|
+ model.descriptor_means[i] = var.mean
+ model.descriptor_sds[i] = var.standard_deviation
+ model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
+ end
end
model.save
model
end
+ # Predict the endpoint for a single query substance from similar training substances.
+ #
+ # Steps:
+ # 1. build the query descriptors for the configured similarity method,
+ # 2. collect every training substance whose similarity to the query exceeds
+ #    algorithms[:similarity][:min] (entries identical to the query are excluded
+ #    and reported under :measurements instead),
+ # 3. return nil / the single neighbor's value / the result of the configured
+ #    prediction algorithm, depending on the number of neighbors found.
+ #
+ # @param substance query substance; must respond to id and name and, depending on
+ #   the similarity method, to fingerprint(type) or properties
+ # @return [Hash] prediction containing :value and :neighbors, optionally
+ #   :measurements and :warning, plus whatever the prediction algorithm returns
def predict_substance substance
- neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features
- measurements = nil
- prediction = {}
- # handle query substance
- if neighbors.collect{|n| n["_id"]}.include? substance.id
-
- query = neighbors.select{|n| n["_id"] == substance.id}.first
- measurements = training_dataset.values(query["_id"],prediction_feature_id)
- prediction[:measurements] = measurements
- prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance."
- neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation)
+
+ # similarity_descriptors are compared with the stored training descriptors,
+ # query_descriptors are handed to the prediction algorithm as query_variables
+ case algorithms[:similarity][:method]
+ when /tanimoto/ # binary features
+ similarity_descriptors = algorithms[:descriptors].collect{|type| substance.fingerprint(type)}.flatten.uniq
+ # TODO this excludes descriptors only present in the query substance
+ query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
+ when /euclid|cosine/ # quantitative features
+ # scale with the training set means/sds, in the same way as scaled_variables
+ # NOTE(review): a property missing in the query substance (nil) raises here - confirm intended handling
+ similarity_descriptors = descriptor_ids.collect_with_index{|id,i|
+ prop = substance.properties[id]
+ prop = prop.median if prop.is_a? Array # measured
+ (prop-descriptor_means[i])/descriptor_sds[i]
+ }
+ # unscaled values; repeated measurements are collapsed to their median
+ # (previously the raw property was returned and the median was discarded)
+ query_descriptors = descriptor_ids.collect{|id|
+ prop = substance.properties[id]
+ prop = prop.median if prop.is_a? Array # measured
+ prop
+ }
+ else
+ bad_request_error "Unknown descriptor type '#{algorithms[:descriptors]}' for similarity method '#{algorithms[:similarity][:method]}'."
end
- if neighbors.empty?
- prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
- elsif neighbors.size == 1
- value = nil
- m = neighbors.first["measurements"]
- if m.size == 1 # single measurement
- value = m.first
- else # multiple measurement
- if m.collect{|t| t.numeric?}.uniq == [true] # numeric
- value = m.median
- elsif m.uniq.size == 1 # single value
- value = m.first
- else # contradictory results
- # TODO add majority vote??
+
+ prediction = {}
+ neighbor_ids = []
+ neighbor_similarities = []
+ neighbor_dependent_variables = []
+ neighbor_independent_variables = [] # one array per descriptor, aligned with neighbor_ids
+
+ # find neighbors
+ substance_ids.each_with_index do |s,i|
+ # handle query substance
+ if substance.id.to_s == s
+ prediction[:measurements] ||= []
+ prediction[:measurements] << dependent_variables[i]
+ prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
+ else
+ # nanoparticles are only compared with nanoparticles that have the same core
+ next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core
+ if fingerprints?
+ neighbor_descriptors = fingerprints[i]
+ else
+ neighbor_descriptors = scaled_variables.collect{|v| v[i]}
+ end
+ sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
+ if sim > algorithms[:similarity][:min]
+ neighbor_ids << s
+ neighbor_similarities << sim
+ neighbor_dependent_variables << dependent_variables[i]
+ independent_variables.each_with_index do |column,j|
+ neighbor_independent_variables[j] ||= []
+ neighbor_independent_variables[j] << column[i]
+ end
end
end
- prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value
+ end
+
+ if neighbor_similarities.empty?
+ prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
+ elsif neighbor_similarities.size == 1
+ # use the value of the neighbor itself, not the first value of the whole training set
+ prediction.merge!({:value => neighbor_dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
else
# call prediction algorithm
- case algorithms[:descriptors][:method]
- when "fingerprint"
- descriptors = substance.fingerprints[algorithms[:descriptors][:type]]
- when "properties"
- descriptors = substance.properties
- else
- bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available."
- end
- params = {
- :method => algorithms[:prediction][:parameters],
- :descriptors => descriptors,
- :neighbors => neighbors,
- :relevant_features => relevant_features
- }
- result = Algorithm.run algorithms[:prediction][:method], params
+ result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
prediction.merge! result
- prediction[:neighbors] = neighbors
- prediction[:neighbors] ||= []
+ prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
end
prediction
end
@@ -221,6 +286,18 @@ module OpenTox
Feature.find(prediction_feature_id)
end
+ # Feature objects for all descriptor ids stored in this model.
+ def descriptors
+ descriptor_ids.map { |descriptor_id| Feature.find(descriptor_id) }
+ end
+
+ # Substance objects for all substance ids stored in this model
+ # (one lookup per stored id, in the stored order).
+ def substances
+ substance_ids.map { |substance_id| Substance.find(substance_id) }
+ end
+
+ # True when the name of the configured similarity method contains "tanimoto",
+ # false otherwise.
+ def fingerprints?
+ !algorithms[:similarity][:method].match("tanimoto").nil?
+ end
+
end
class LazarClassification < Lazar