summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2016-10-12 21:32:27 +0200
committerChristoph Helma <helma@in-silico.ch>2016-10-12 21:32:27 +0200
commitdc4ab1f4e64d738d6c0b70f0b690a2359685080f (patch)
tree054ae887bf978b519a95dce5dbead59bbc67a2bb
parent1ec5ad2c67f270287499980a794e51bc9a6bbd84 (diff)
physchem regression, correlation_filter for fingerprints
-rw-r--r--lib/caret.rb184
-rw-r--r--lib/classification.rb23
-rw-r--r--lib/compound.rb48
-rw-r--r--lib/feature_selection.rb60
-rw-r--r--lib/model.rb197
-rw-r--r--lib/overwrite.rb13
-rw-r--r--lib/physchem.rb14
-rw-r--r--lib/regression.rb15
-rw-r--r--lib/similarity.rb25
-rw-r--r--lib/substance.rb60
-rw-r--r--test/model.rb41
-rw-r--r--test/regression.rb37
12 files changed, 328 insertions, 389 deletions
diff --git a/lib/caret.rb b/lib/caret.rb
index b999b06..59e02da 100644
--- a/lib/caret.rb
+++ b/lib/caret.rb
@@ -5,33 +5,56 @@ module OpenTox
# TODO classification
# model list: https://topepo.github.io/caret/modelList.html
- attr_accessor :descriptors, :neighbors, :method, :relevant_features, :data_frame, :feature_names, :weights, :query_features
-
- def initialize descriptors:, neighbors:, method:, relevant_features:
- @descriptors = descriptors
- @neighbors = neighbors
- @method = method
- @relevant_features = relevant_features
- end
-
- def self.regression descriptors:, neighbors:, method:, relevant_features:nil
-
- caret = new(descriptors:descriptors, neighbors:neighbors, method:method, relevant_features:relevant_features)
- # collect training data for R
- if descriptors.is_a? Array
- caret.fingerprint2R
- elsif descriptors.is_a? Hash
- caret.properties2R
- else
- bad_request_error "Descriptors should be a fingerprint (Array) or properties (Hash). Cannot handle '#{descriptors.class}'."
- end
- if caret.feature_names.empty? or caret.data_frame.flatten.uniq == ["NA"]
- prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors)
+ def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables:
+ if independent_variables.flatten.uniq == ["NA"]
+ prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
else
- prediction = caret.r_model_prediction
+ dependent_variables.each_with_index do |v,i|
+ dependent_variables[i] = to_r(v)
+ end
+ independent_variables.each_with_index do |c,i|
+ c.each_with_index do |v,j|
+ independent_variables[i][j] = to_r(v)
+ end
+ end
+ query_variables.each_with_index do |v,i|
+ query_variables[i] = to_r(v)
+ end
+ begin
+ R.assign "weights", weights
+ r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+ R.eval "data <- #{r_data_frame}"
+ R.assign "features", (0..independent_variables.size-1).to_a
+ R.eval "names(data) <- append(c('activities'),features)" #
+ R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
+ rescue => e
+ $logger.debug "R caret model creation error for:"
+ $logger.debug JSON.pretty_generate(dependent_variables)
+ $logger.debug JSON.pretty_generate(independent_variables)
+        return {:value => nil, :warning => "R caret model creation error."}
+ end
+ begin
+ R.eval "query <- data.frame(rbind(c(#{query_variables.join ','})))"
+ R.eval "names(query) <- features"
+ R.eval "prediction <- predict(model,query)"
+ value = R.eval("prediction").to_f
+ rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
+ r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
+ prediction_interval = value-1.96*rmse, value+1.96*rmse
+ prediction = {
+ :value => value,
+ :rmse => rmse,
+ :r_squared => r_squared,
+ :prediction_interval => prediction_interval
+ }
+ rescue => e
+ $logger.debug "R caret prediction error for:"
+ $logger.debug self.inspect
+ return nil
+ end
if prediction.nil? or prediction[:value].nil?
- prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors)
+ prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
end
end
@@ -39,111 +62,18 @@ module OpenTox
end
- def fingerprint2R
-
- values = []
- features = {}
- @weights = []
- descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort
-
- neighbors.each do |n|
- activities = n["measurements"]
- activities.each do |act|
- values << act
- @weights << n["similarity"]
- descriptor_ids.each do |id|
- features[id] ||= []
- features[id] << n["descriptors"].include?(id)
- end
- end if activities
- end
-
- @feature_names = []
- @data_frame = [values]
-
- features.each do |k,v|
- unless v.uniq.size == 1
- @data_frame << v.collect{|m| m ? "T" : "F"}
- @feature_names << k
- end
- end
- @query_features = @feature_names.collect{|f| descriptors.include?(f) ? "T" : "F"}
-
+ # call caret methods dynamically, e.g. Caret.pls
+ def self.method_missing(sym, *args, &block)
+ args.first[:method] = sym.to_s
+ self.create_model_and_predict args.first
end
-
- def properties2R
-
- @weights = []
- @feature_names = []
- @query_features = []
-
- # keep only descriptors with values
- @relevant_features.keys.each_with_index do |f,i|
- if @descriptors[f]
- @feature_names << f
- @query_features << @descriptors[f].median
- else
- neighbors.each do |n|
- n["descriptors"].delete_at i
- end
- end
- end
-
- measurements = neighbors.collect{|n| n["measurements"]}.flatten
- # initialize data frame with 'NA' defaults
- @data_frame = Array.new(@feature_names.size+1){Array.new(measurements.size,"NA") }
-
- i = 0
- # parse neighbor activities and descriptors
- neighbors.each do |n|
- activities = n["measurements"]
- activities.each do |act| # multiple measurements are treated as separate instances
- unless n["descriptors"].include?(nil)
- data_frame[0][i] = act
- @weights << n["similarity"]
- n["descriptors"].each_with_index do |d,j|
- @data_frame[j+1][i] = d
- end
- i += 1
- end
- end if activities # ignore neighbors without measurements
- end
-
- end
-
- def r_model_prediction
- begin
- R.assign "weights", @weights
- r_data_frame = "data.frame(#{@data_frame.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
- R.eval "data <- #{r_data_frame}"
- R.assign "features", @feature_names
- R.eval "names(data) <- append(c('activities'),features)" #
- R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
- rescue => e
- $logger.debug "R caret model creation error for:"
- $logger.debug JSON.pretty_generate(self.inspect)
- return nil
- end
- begin
- R.eval "query <- data.frame(rbind(c(#{@query_features.join ','})))"
- R.eval "names(query) <- features"
- R.eval "prediction <- predict(model,query)"
- value = R.eval("prediction").to_f
- rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
- r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
- prediction_interval = value-1.96*rmse, value+1.96*rmse
- {
- :value => value,
- :rmse => rmse,
- :r_squared => r_squared,
- :prediction_interval => prediction_interval
- }
- rescue => e
- $logger.debug "R caret prediction error for:"
- $logger.debug self.inspect
- return nil
- end
+ def self.to_r v
+ return "F" if v == false
+ return "T" if v == true
+ return "NA" if v.nil?
+ return "NA" if v.is_a? Float and v.nan?
+ v
end
end
diff --git a/lib/classification.rb b/lib/classification.rb
index 6582e7d..e8c179f 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -3,24 +3,17 @@ module OpenTox
class Classification
- def self.weighted_majority_vote descriptors:nil, neighbors:, method:nil, relevant_features:nil
- sims = {}
- neighbors.each do |neighbor|
- sim = neighbor["similarity"]
- activities = neighbor["measurements"]
- activities.each do |act|
- sims[act] ||= []
- sims[act] << sim
- end if activities
+ def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:
+ class_weights = {}
+ dependent_variables.each_with_index do |v,i|
+ class_weights[v] ||= []
+ class_weights[v] << weights[i] unless v.nil?
end
- sim_all = sims.collect{|a,s| s}.flatten
- sim_sum = sim_all.sum
- sim_max = sim_all.max
probabilities = {}
- sims.each do |a,s|
- probabilities[a] = s.sum/sim_sum
+ class_weights.each do |a,w|
+ probabilities[a] = w.sum/weights.sum
end
- probabilities = probabilities.collect{|a,p| [a,sim_max*p]}.to_h
+ probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
p_max = probabilities.collect{|a,p| p}.max
prediction = probabilities.key(p_max)
{:value => prediction,:probabilities => probabilities}
diff --git a/lib/compound.rb b/lib/compound.rb
index 93cfc03..0f178ce 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -75,7 +75,11 @@ module OpenTox
fingerprints[type]
end
- def calculated_physchem descriptors=PhysChem.openbabel_descriptors
+ def calculated_properties types=["OPENBABEL"]
+ descriptors = []
+ types.each do |t|
+ descriptors += PhysChem.descriptors OpenTox.const_get(t)
+ end
# TODO: speedup java descriptors
calculated_ids = properties.keys
# BSON::ObjectId instances are not allowed as keys in a BSON document.
@@ -254,48 +258,6 @@ module OpenTox
self["chemblid"]
end
-=begin
- def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:)
- neighbors = []
- dataset = Dataset.find(dataset_id)
- # TODO: fix db_neighbors
-# if type == DEFAULT_FINGERPRINT
-# neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id)
-# neighbors.each do |n|
-# n["measurements"] = dataset.values(n["_id"],prediction_feature_id)
-# end
-# else
- query_fingerprint = self.fingerprint type
- dataset.compounds.each do |compound|
- values = dataset.values(compound,prediction_feature_id)
- if values
- candidate_fingerprint = compound.fingerprint type
- sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint)
- neighbors << {"_id" => compound.id, "measurements" => values, "similarity" => sim} if sim >= min_sim
- end
-# end
- end
- neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]}
- end
-=end
-
-# def physchem_neighbors params
-# # TODO: fix, tests
-# feature_dataset = Dataset.find params[:feature_dataset_id]
-# query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
-# neighbors = []
-# feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
-# # TODO implement pearson and cosine similarity separatly
-# R.assign "x", query_fingerprint
-# R.assign "y", candidate_fingerprint
-# sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
-# if sim >= params[:min_sim]
-# neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
-# end
-# end
-# neighbors
-# end
-
def db_neighbors min_sim: 0.1, dataset_id:
p fingerprints[DEFAULT_FINGERPRINT]
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb
index 43e3bea..f599539 100644
--- a/lib/feature_selection.rb
+++ b/lib/feature_selection.rb
@@ -3,41 +3,39 @@ module OpenTox
class FeatureSelection
- def self.correlation_filter dataset:, prediction_feature:, types:nil
- # TODO: speedup, single assignment of all features to R+ parallel computation of significance?
+ def self.correlation_filter model
relevant_features = {}
- measurements = []
- substances = []
- dataset.substances.each do |s|
- dataset.values(s,prediction_feature).each do |act|
- measurements << act
- substances << s
- end
- end
- R.assign "tox", measurements
- feature_ids = dataset.substances.collect{ |s| s["properties"].keys}.flatten.uniq
- feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types
- feature_ids.each do |feature_id|
- feature_values = substances.collect{|s| s["properties"][feature_id].first if s["properties"][feature_id]}
- unless feature_values.uniq.size == 1
- R.assign "feature", feature_values
- begin
- R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
- pvalue = R.eval("cor$p.value").to_ruby
- if pvalue <= 0.05
- r = R.eval("cor$estimate").to_ruby
- relevant_features[feature_id] = {}
- relevant_features[feature_id]["pvalue"] = pvalue
- relevant_features[feature_id]["r"] = r
- relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby
- relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby
- end
- rescue
- warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed."
+ R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
+ model.descriptor_weights = []
+ selected_variables = []
+ selected_descriptor_ids = []
+ model.independent_variables.each_with_index do |v,i|
+ R.assign "independent", v.collect{|n| to_r(n)}
+ begin
+ R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')"
+ pvalue = R.eval("cor$p.value").to_ruby
+ if pvalue <= 0.05
+ model.descriptor_weights << R.eval("cor$estimate").to_ruby**2
+ selected_variables << v
+ selected_descriptor_ids << model.descriptor_ids[i]
end
+ rescue
+ #warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with '#{Feature.find(model.descriptor_ids[i]).name}' (#{v}) failed."
+ warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
end
end
- relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
+
+ model.independent_variables = selected_variables
+ model.descriptor_ids = selected_descriptor_ids
+ model
+ end
+
+ def self.to_r v
+ return 0 if v == false
+ return 1 if v == true
+ return "NA" if v.nil?
+ return "NA" if v.is_a? Float and v.nan?
+ v
end
end
diff --git a/lib/model.rb b/lib/model.rb
index 290309a..f3f0603 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -11,10 +11,18 @@ module OpenTox
field :name, type: String
field :creator, type: String, default: __FILE__
+ field :algorithms, type: Hash, default:{}
field :training_dataset_id, type: BSON::ObjectId
+ field :substance_ids, type: Array, default:[]
field :prediction_feature_id, type: BSON::ObjectId
- field :algorithms, type: Hash
- field :relevant_features, type: Hash
+ field :dependent_variables, type: Array, default:[]
+ field :descriptor_ids, type:Array, default:[]
+ field :independent_variables, type: Array, default:[]
+ field :fingerprints, type: Array, default:[]
+ field :descriptor_weights, type: Array, default:[]
+ field :descriptor_means, type: Array, default:[]
+ field :descriptor_sds, type: Array, default:[]
+ field :scaled_variables, type: Array, default:[]
def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
@@ -40,7 +48,7 @@ module OpenTox
model.prediction_feature_id = prediction_feature.id
model.training_dataset_id = training_dataset.id
- model.name = "#{training_dataset.name} #{prediction_feature.name}"
+ model.name = "#{prediction_feature.name} (#{training_dataset.name})"
# set defaults
substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
@@ -49,10 +57,7 @@ module OpenTox
if substance_classes.first == "OpenTox::Compound"
model.algorithms = {
- :descriptors => {
- :method => "fingerprint",
- :type => 'MP2D',
- },
+ :descriptors => ['MP2D'],
:similarity => {
:method => "Algorithm::Similarity.tanimoto",
:min => 0.1
@@ -66,25 +71,20 @@ module OpenTox
}
elsif model.class == LazarRegression
model.algorithms[:prediction] = {
- :method => "Algorithm::Caret.regression",
- :parameters => "pls",
+ :method => "Algorithm::Caret.pls",
}
end
elsif substance_classes.first == "OpenTox::Nanoparticle"
model.algorithms = {
- :descriptors => {
- :method => "properties",
- #:types => ["P-CHEM","Proteomics"],
- :types => ["P-CHEM"],
- },
+ :descriptors => ["P-CHEM"],
+ #:descriptors => ["P-CHEM","Proteomics"],
:similarity => {
:method => "Algorithm::Similarity.weighted_cosine",
:min => 0.5
},
:prediction => {
- :method => "Algorithm::Caret.regression",
- :parameters => "rf",
+ :method => "Algorithm::Caret.rf",
},
:feature_selection => {
:method => "Algorithm::FeatureSelection.correlation_filter",
@@ -106,63 +106,128 @@ module OpenTox
end
end
+ # parse dependent_variables from training dataset
+ training_dataset.substances.each do |substance|
+ values = training_dataset.values(substance,model.prediction_feature_id)
+ values.each do |v|
+ model.substance_ids << substance.id.to_s
+ model.dependent_variables << v
+ end if values
+ end
+
+ # parse fingerprints
+ if model.fingerprints?
+ model.algorithms[:descriptors].each do |type|
+ model.substances.each_with_index do |s,i|
+ model.fingerprints[i] ||= []
+ model.fingerprints[i] += s.fingerprint(type)
+ model.fingerprints[i].uniq!
+ end
+ end
+ model.descriptor_ids = model.fingerprints.flatten.uniq
+ model.descriptor_ids.each do |d|
+ model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d}
+ end
+ else
+ # parse independent_variables
+ if (model.algorithms[:descriptors] & ["PhysChem::OPENBABEL","PhysChem::CDK","PhysChem::JOELIB"]).empty?
+ properties = model.substances.collect { |s| s.properties }
+ all_property_ids = properties.collect{|p| p.keys}.flatten.uniq
+ model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category }
+ model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
+
+ # calculate physchem properties
+ else
+ properties = model.substances.collect { |s| s.calculated_properties(model.algorithms[:descriptors]) }
+ model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq
+ model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}}
+ end
+ end
+
if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
- model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types]
+ model = Algorithm.run model.algorithms[:feature_selection][:method], model
+ end
+
+ # scale independent_variables
+ unless model.fingerprints?
+ model.independent_variables.each_with_index do |var,i|
+ model.descriptor_means[i] = var.mean
+ model.descriptor_sds[i] = var.standard_deviation
+ model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
+ end
end
model.save
model
end
def predict_substance substance
- neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features
- measurements = nil
- prediction = {}
- # handle query substance
- if neighbors.collect{|n| n["_id"]}.include? substance.id
-
- query = neighbors.select{|n| n["_id"] == substance.id}.first
- measurements = training_dataset.values(query["_id"],prediction_feature_id)
- prediction[:measurements] = measurements
- prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance."
- neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation)
+
+ case algorithms[:similarity][:method]
+ when /tanimoto/ # binary features
+ similarity_descriptors = algorithms[:descriptors].collect{|type| substance.fingerprint(type)}.flatten.uniq
+ # TODO this excludes descriptors only present in the query substance
+ query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
+ when /euclid|cosine/ # quantitative features
+ similarity_descriptors = descriptor_ids.collect_with_index{|id,i|
+ prop = substance.properties[id]
+ prop = prop.median if prop.is_a? Array # measured
+ (prop-descriptor_means[i])/descriptor_sds[i]
+ }
+ query_descriptors = descriptor_ids.collect_with_index{|id,i|
+ prop = substance.properties[id]
+ prop = prop.median if prop.is_a? Array # measured
+ substance.properties[id]
+ }
+ else
+ bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
end
- if neighbors.empty?
- prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
- elsif neighbors.size == 1
- value = nil
- m = neighbors.first["measurements"]
- if m.size == 1 # single measurement
- value = m.first
- else # multiple measurement
- if m.collect{|t| t.numeric?}.uniq == [true] # numeric
- value = m.median
- elsif m.uniq.size == 1 # single value
- value = m.first
- else # contradictory results
- # TODO add majority vote??
+
+ prediction = {}
+ neighbor_ids = []
+ neighbor_similarities = []
+ neighbor_dependent_variables = []
+ neighbor_independent_variables = []
+
+ prediction = {}
+ # find neighbors
+ substance_ids.each_with_index do |s,i|
+ # handle query substance
+ if substance.id.to_s == s
+ prediction[:measurements] ||= []
+ prediction[:measurements] << dependent_variables[i]
+ prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
+ else
+ next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core
+ if fingerprints?
+ neighbor_descriptors = fingerprints[i]
+ else
+ neighbor_descriptors = scaled_variables.collect{|v| v[i]}
+ end
+ sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
+ if sim > algorithms[:similarity][:min]
+ neighbor_ids << s
+ neighbor_similarities << sim
+ neighbor_dependent_variables << dependent_variables[i]
+ independent_variables.each_with_index do |c,j|
+ neighbor_independent_variables[j] ||= []
+ neighbor_independent_variables[j] << independent_variables[j][i]
+ end
end
end
- prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value
+ end
+
+ measurements = nil
+
+ if neighbor_similarities.empty?
+ prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
+ elsif neighbor_similarities.size == 1
+ prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
else
# call prediction algorithm
- case algorithms[:descriptors][:method]
- when "fingerprint"
- descriptors = substance.fingerprints[algorithms[:descriptors][:type]]
- when "properties"
- descriptors = substance.properties
- else
- bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available."
- end
- params = {
- :method => algorithms[:prediction][:parameters],
- :descriptors => descriptors,
- :neighbors => neighbors,
- :relevant_features => relevant_features
- }
- result = Algorithm.run algorithms[:prediction][:method], params
+ result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
+ p result
prediction.merge! result
- prediction[:neighbors] = neighbors
- prediction[:neighbors] ||= []
+ prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
end
prediction
end
@@ -221,6 +286,18 @@ module OpenTox
Feature.find(prediction_feature_id)
end
+ def descriptors
+ descriptor_ids.collect{|id| Feature.find(id)}
+ end
+
+ def substances
+ substance_ids.collect{|id| Substance.find(id)}
+ end
+
+ def fingerprints?
+ algorithms[:similarity][:method].match("tanimoto") ? true : false
+ end
+
end
class LazarClassification < Lazar
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index 4a79051..d0422ee 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -101,13 +101,13 @@ class Array
end
def mean
- self.inject{ |sum, el| sum + el }.to_f / self.size
+ self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
end
def sample_variance
m = self.mean
- sum = self.inject(0){|accum, i| accum +(i-m)**2 }
- sum/(self.length - 1).to_f
+ sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
+ sum/(self.compact.length - 1).to_f
end
def standard_deviation
@@ -123,6 +123,13 @@ class Array
end
end
+ def collect_with_index
+ result = []
+ self.each_with_index do |elt, idx|
+ result << yield(elt, idx)
+ end
+ result
+ end
end
module URI
diff --git a/lib/physchem.rb b/lib/physchem.rb
index c32e382..327acd8 100644
--- a/lib/physchem.rb
+++ b/lib/physchem.rb
@@ -14,7 +14,7 @@ module OpenTox
JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
- OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
+ OPENBABEL = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
name,description = d.split(/\s+/,2)
["Openbabel."+name,description] unless obexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]
@@ -25,17 +25,17 @@ module OpenTox
prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'')
d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] }
end
- CDKDESCRIPTORS = cdkdescriptors
+ CDK = cdkdescriptors
# exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
# strip Joelib messages from stdout
- JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
+ JOELIB = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
name = d[:java_class].sub(/^joelib2.feature.types./,'')
["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]
- DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
+ DESCRIPTORS = OPENBABEL.merge(CDK.merge(JOELIB))
require_relative "unique_descriptors.rb"
@@ -65,15 +65,15 @@ module OpenTox
end
def self.openbabel_descriptors
- descriptors OBDESCRIPTORS
+ descriptors OPENBABEL
end
def self.cdk_descriptors
- descriptors CDKDESCRIPTORS
+ descriptors CDK
end
def self.joelib_descriptors
- descriptors JOELIBDESCRIPTORS
+ descriptors JOELIB
end
def calculate compound
diff --git a/lib/regression.rb b/lib/regression.rb
index 0e5e06b..bed6df8 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,18 +3,15 @@ module OpenTox
class Regression
- def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil
+ def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:
+ #def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil
# TODO: prediction_interval
weighted_sum = 0.0
sim_sum = 0.0
- neighbors.each do |neighbor|
- sim = neighbor["similarity"]
- activities = neighbor["measurements"]
- activities.each do |act|
- weighted_sum += sim*act
- sim_sum += sim
- end if activities
- end
+ dependent_variables.each_with_index do |v,i|
+ weighted_sum += weights[i]*dependent_variables[i]
+ sim_sum += weights[i]
+ end if dependent_variables
sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
{:value => prediction}
end
diff --git a/lib/similarity.rb b/lib/similarity.rb
index b9b4571..328d42a 100644
--- a/lib/similarity.rb
+++ b/lib/similarity.rb
@@ -19,18 +19,19 @@ module OpenTox
( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
end
- def self.euclid fingerprints
- sq = fingerprints[0].zip(fingerprints[1]).map{|a,b| (a - b) ** 2}
+ def self.euclid scaled_properties
+ sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
Math.sqrt(sq.inject(0) {|s,c| s + c})
end
# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
- def self.cosine fingerprints
- Algorithm::Vector.dot_product(fingerprints[0], fingerprints[1]) / (Algorithm::Vector.magnitude(fingerprints[0]) * Algorithm::Vector.magnitude(fingerprints[1]))
+ def self.cosine scaled_properties
+ scaled_properties = remove_nils scaled_properties
+ Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
end
- def self.weighted_cosine fingerprints # [a,b,weights]
- a, b, w = fingerprints
+ def self.weighted_cosine scaled_properties # [a,b,weights]
+ a,b,w = remove_nils scaled_properties
dot_product = 0
magnitude_a = 0
magnitude_b = 0
@@ -42,6 +43,18 @@ module OpenTox
dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
end
+ def self.remove_nils scaled_properties
+ a =[]; b = []; w = []
+ (0..scaled_properties.first.size-1).each do |i|
+ if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan?
+ a << scaled_properties[0][i]
+ b << scaled_properties[1][i]
+ w << scaled_properties[2][i]
+ end
+ end
+ [a,b,w]
+ end
+
end
end
end
diff --git a/lib/substance.rb b/lib/substance.rb
index d271327..31c465e 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -5,64 +5,4 @@ module OpenTox
field :dataset_ids, type: Array, default: []
end
- def neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features:nil
- # TODO enable empty dataset_id -> use complete db
- case descriptors[:method]
- when "fingerprint"
- fingerprint_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity
- when "properties"
- properties_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity, relevant_features: relevant_features
- else
- bad_request_error "Descriptor method '#{descriptors[:method]}' not implemented."
- end
- end
-
- def fingerprint_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:
- neighbors = []
- dataset = Dataset.find(dataset_id)
- dataset.substances.each do |substance|
- values = dataset.values(substance,prediction_feature_id)
- if values
- query_descriptors = self.send(descriptors[:method].to_sym, descriptors[:type])
- candidate_descriptors = substance.send(descriptors[:method].to_sym, descriptors[:type])
- sim = Algorithm.run similarity[:method], [query_descriptors, candidate_descriptors]
- neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min]
- end
- end
- neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]}
- end
-
- def properties_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features:
- neighbors = []
- dataset = Dataset.find(dataset_id)
- weights = relevant_features.collect{|k,v| v["r"]**2}
- means = relevant_features.collect{|k,v| v["mean"]}
- standard_deviations = relevant_features.collect{|k,v| v["sd"]}
- query_descriptors = relevant_features.keys.collect{|i| properties[i].is_a?(Array) ? properties[i].median : nil }
- dataset.substances.each do |substance|
- values = dataset.values(substance,prediction_feature_id)
- # exclude nanoparticles with different core
- # TODO validate exclusion
- next if substance.is_a? Nanoparticle and substance.core != self.core
- if values
- candidate_descriptors = relevant_features.keys.collect{|i| substance.properties[i].is_a?(Array) ? substance.properties[i].median : nil }
- q = []
- c = []
- w = []
- (0..relevant_features.size-1).each do |i|
- # add only complete pairs
- if query_descriptors[i] and candidate_descriptors[i]
- w << weights[i]
- # scale values
- q << (query_descriptors[i] - means[i])/standard_deviations[i]
- c << (candidate_descriptors[i] - means[i])/standard_deviations[i]
- end
- end
- sim = Algorithm.run similarity[:method], [q, c, w]
- neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min]
- end
- end
- neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]}
- end
-
end
diff --git a/test/model.rb b/test/model.rb
index 02b8e73..9f30928 100644
--- a/test/model.rb
+++ b/test/model.rb
@@ -4,17 +4,13 @@ class ModelTest < MiniTest::Test
def test_default_regression
algorithms = {
- :descriptors => {
- :method => "fingerprint",
- :type => "MP2D"
- },
+ :descriptors => [ "MP2D" ],
:similarity => {
:method => "Algorithm::Similarity.tanimoto",
:min => 0.1
},
:prediction => {
- :method => "Algorithm::Caret.regression",
- :parameters => "pls",
+ :method => "Algorithm::Caret.pls",
},
:feature_selection => nil,
}
@@ -29,17 +25,13 @@ class ModelTest < MiniTest::Test
def test_regression_parameters
algorithms = {
- :descriptors => {
- :method => "fingerprint",
- :type => "MP2D"
- },
+ :descriptors => [ "MP2D" ],
:similarity => {
:method => "Algorithm::Similarity.tanimoto",
:min => 0.3
},
:prediction => {
:method => "Algorithm::Regression.weighted_average",
- :parameters => "rf",
},
:feature_selection => nil,
}
@@ -57,18 +49,22 @@ class ModelTest < MiniTest::Test
def test_physchem_regression
algorithms = {
- :descriptors => "physchem",
+ :descriptors => ["PhysChem::OPENBABEL"],
:similarity => {
- :method => "Algorithm::Similarity.weighted_cosine",
+ :method => "Algorithm::Similarity.cosine",
}
}
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
assert_kind_of Model::LazarRegression, model
- assert_equal "Algorithm::Caret.regression", model.algorithms[:prediction][:method]
- assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method]
+ assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
+ assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method]
assert_equal 0.1, model.algorithms[:similarity][:min]
assert_equal algorithms[:descriptors], model.algorithms[:descriptors]
+ prediction = model.predict training_dataset.substances[10]
+ p prediction
+ refute_nil prediction[:value]
+ # TODO test prediction
end
def test_nanoparticle_default
@@ -78,8 +74,7 @@ class ModelTest < MiniTest::Test
training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
end
model = Model::Lazar.create training_dataset: training_dataset
- assert_equal "Algorithm::Caret.regression", model.algorithms[:prediction][:method]
- assert_equal "rf", model.algorithms[:prediction][:parameters]
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method]
prediction = model.predict training_dataset.substances[14]
assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
@@ -99,7 +94,7 @@ class ModelTest < MiniTest::Test
training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv")
model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
assert_kind_of Model::LazarRegression, model
- assert_equal "Algorithm::Caret.regression", model.algorithms[:prediction][:method]
+ assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
assert_equal 0.1, model.algorithms[:similarity][:min]
assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
@@ -111,10 +106,7 @@ class ModelTest < MiniTest::Test
def test_default_classification
algorithms = {
- :descriptors => {
- :method => "fingerprint",
- :type => 'MP2D',
- },
+ :descriptors => [ "MP2D" ],
:similarity => {
:method => "Algorithm::Similarity.tanimoto",
:min => 0.1
@@ -135,10 +127,7 @@ class ModelTest < MiniTest::Test
def test_classification_parameters
algorithms = {
- :descriptors => {
- :method => "fingerprint",
- :type => 'MACCS',
- },
+ :descriptors => ['MACCS'],
:similarity => {
:min => 0.4
},
diff --git a/test/regression.rb b/test/regression.rb
index 4c21450..aad4195 100644
--- a/test/regression.rb
+++ b/test/regression.rb
@@ -45,12 +45,45 @@ class LazarRegressionTest < MiniTest::Test
end
def test_local_physchem_regression
- skip # TODO: fix
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
- model = Model::Lazar.create(training_dataset.features.first, training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
+ algorithms = {
+ :descriptors => ["PhysChem::OPENBABEL"],
+ :similarity => {
+ :method => "Algorithm::Similarity.weighted_cosine",
+ :min => 0.5
+ },
+ }
+ model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
+ p model
compound = Compound.from_smiles "NC(=O)OCCC"
prediction = model.predict compound
refute_nil prediction[:value]
end
+ def test_local_physchem_regression_with_feature_selection
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
+ algorithms = {
+ :descriptors => {
+ :method => "calculated_properties",
+ :types => ["OPENBABEL"]
+ },
+ :similarity => {
+ :method => "Algorithm::Similarity.weighted_cosine",
+ :min => 0.5
+ },
+ :feature_selection => {
+ :method => "Algorithm::FeatureSelection.correlation_filter",
+ },
+ }
+ model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
+ p model
+ compound = Compound.from_smiles "NC(=O)OCCC"
+ prediction = model.predict compound
+ refute_nil prediction[:value]
+ end
+
+ def test_local_physchem_classification
+ skip
+ end
+
end