summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2016-10-05 13:22:12 +0200
committerChristoph Helma <helma@in-silico.ch>2016-10-05 13:22:12 +0200
commit5d4e5e463c2b87241bbb56e4658e1e26c0ed084f (patch)
treebbae8f77dbb2ac85053f1253ab518c3076e0d176 /lib
parentadefea0e78a4f05a2c9537e643873ad61fc22a0a (diff)
substance and nanoparticle model creation and predictions
Diffstat (limited to 'lib')
-rw-r--r--lib/algorithm.rb13
-rw-r--r--lib/classification.rb2
-rw-r--r--lib/compound.rb12
-rw-r--r--lib/feature_selection.rb46
-rw-r--r--lib/lazar.rb3
-rw-r--r--lib/model.rb135
-rw-r--r--lib/nanoparticle.rb25
-rw-r--r--lib/regression.rb67
-rw-r--r--lib/similarity.rb15
-rw-r--r--lib/substance.rb63
10 files changed, 265 insertions, 116 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 113f847..0e4b93a 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -2,18 +2,9 @@ module OpenTox
module Algorithm
- # Generic method to execute algorithms
- # Algorithms should:
- # - accept a Compound, an Array of Compounds or a Dataset as first argument
- # - optional parameters as second argument
- # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values
- # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object
- # @param [Hash] Algorithm parameters
- # @return Algorithm result
- def self.run algorithm, object, parameters=nil
- bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
+ def self.run algorithm, parameters=nil
klass,method = algorithm.split('.')
- parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
+ Object.const_get(klass).send(method,parameters)
end
end
diff --git a/lib/classification.rb b/lib/classification.rb
index 03c32c4..01ba878 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -3,7 +3,7 @@ module OpenTox
class Classification
- def self.weighted_majority_vote substance:, neighbors:
+ def self.weighted_majority_vote descriptors:nil, neighbors:
sims = {}
neighbors.each do |neighbor|
sim = neighbor["similarity"]
diff --git a/lib/compound.rb b/lib/compound.rb
index 4689d7a..4d62c53 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -75,9 +75,9 @@ module OpenTox
fingerprints[type]
end
- def physchem descriptors=PhysChem.openbabel_descriptors
+ def calculated_physchem descriptors=PhysChem.openbabel_descriptors
# TODO: speedup java descriptors
- calculated_ids = physchem_descriptors.keys
+ calculated_ids = descriptors.keys
# BSON::ObjectId instances are not allowed as keys in a BSON document.
new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
descs = {}
@@ -90,11 +90,11 @@ module OpenTox
# avoid recalculating Cdk features with multiple values
descs.keys.uniq.each do |k|
descs[k].send(k[0].downcase,k[1],self).each do |n,v|
- physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
+ descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
end
end
save
- physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
+ descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
end
def smarts_match smarts, count=false
@@ -254,6 +254,7 @@ module OpenTox
self["chemblid"]
end
+=begin
def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:)
neighbors = []
dataset = Dataset.find(dataset_id)
@@ -276,6 +277,7 @@ module OpenTox
end
neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]}
end
+=end
# def physchem_neighbors params
# # TODO: fix, tests
@@ -340,7 +342,7 @@ module OpenTox
# @return [Float] molecular weight
def molecular_weight
mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
- physchem([mw_feature])[mw_feature.id.to_s]
+ calculated_physchem([mw_feature])[mw_feature.id.to_s]
end
private
diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb
new file mode 100644
index 0000000..43e3bea
--- /dev/null
+++ b/lib/feature_selection.rb
@@ -0,0 +1,46 @@
+module OpenTox
+ module Algorithm
+
+ class FeatureSelection
+
+ def self.correlation_filter dataset:, prediction_feature:, types:nil
+ # TODO: speedup, single assignment of all features to R+ parallel computation of significance?
+ relevant_features = {}
+ measurements = []
+ substances = []
+ dataset.substances.each do |s|
+ dataset.values(s,prediction_feature).each do |act|
+ measurements << act
+ substances << s
+ end
+ end
+ R.assign "tox", measurements
+ feature_ids = dataset.substances.collect{ |s| s["properties"].keys}.flatten.uniq
+ feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types
+ feature_ids.each do |feature_id|
+ feature_values = substances.collect{|s| s["properties"][feature_id].first if s["properties"][feature_id]}
+ unless feature_values.uniq.size == 1
+ R.assign "feature", feature_values
+ begin
+ R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
+ pvalue = R.eval("cor$p.value").to_ruby
+ if pvalue <= 0.05
+ r = R.eval("cor$estimate").to_ruby
+ relevant_features[feature_id] = {}
+ relevant_features[feature_id]["pvalue"] = pvalue
+ relevant_features[feature_id]["r"] = r
+ relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby
+ relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby
+ end
+ rescue
+ warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed."
+ end
+ end
+ end
+ relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
+ end
+
+ end
+
+ end
+end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 46605d3..d0f05c0 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -78,7 +78,8 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","
"nanoparticle.rb",
"dataset.rb",
"algorithm.rb",
- "similarity",
+ "similarity.rb",
+ "feature_selection.rb",
"model.rb",
"classification.rb",
"regression.rb",
diff --git a/lib/model.rb b/lib/model.rb
index 749611e..a272580 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -28,101 +28,91 @@ module OpenTox
when /Regression/
model = LazarRegression.new
end
+
# guess model type
elsif prediction_feature.numeric?
model = LazarRegression.new
else
model = LazarClassification.new
end
+
# set defaults
- if model.class == LazarClassification
+ substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
+ bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
+
+ if substance_classes.first == "OpenTox::Compound"
+
model.algorithms = {
+ :descriptors => {
+ :method => "fingerprint",
+ :type => 'MP2D',
+ },
:similarity => {
- :descriptors => "fingerprint['MP2D']",
:method => "Algorithm::Similarity.tanimoto",
:min => 0.1
},
- :prediction => {
- :descriptors => "fingerprint['MP2D']",
- :method => "Algorithm::Classification.weighted_majority_vote",
- },
- :feature_selection => nil,
+ :feature_selection => nil
}
- elsif model.class == LazarRegression
+
+ if model.class == LazarClassification
+ model.algorithms[:prediction] = {
+ :method => "Algorithm::Classification.weighted_majority_vote",
+ }
+ elsif model.class == LazarRegression
+ model.algorithms[:prediction] = {
+ :method => "Algorithm::Regression.caret",
+ :parameters => "pls",
+ }
+ end
+
+ elsif substance_classes.first == "OpenTox::Nanoparticle"
model.algorithms = {
+ :descriptors => {
+ :method => "properties",
+ #:types => ["P-CHEM","Proteomics"],
+ :types => ["P-CHEM"],
+ },
:similarity => {
- :descriptors => "fingerprint['MP2D']",
- :method => "Algorithm::Similarity.tanimoto",
- :min => 0.1
+ :method => "Algorithm::Similarity.weighted_cosine",
+ :min => 0.5
},
:prediction => {
- :descriptors => "fingerprint['MP2D']",
- :method => "Algorithm::Regression.local_caret",
- :parameters => "pls",
+ :method => "Algorithm::Regression.caret",
+ :parameters => "rf",
+ },
+ :feature_selection => {
+ :method => "Algorithm::FeatureSelection.correlation_filter",
},
- :feature_selection => nil,
}
+ else
+ bad_request_error "Cannot create models for #{substance_classes.first}."
end
- # overwrite defaults
+ # overwrite defaults with explicit parameters
algorithms.each do |type,parameters|
- parameters.each do |p,v|
- model.algorithms[type][p] = v
- end if parameters
+ if parameters and parameters.is_a? Hash
+ parameters.each do |p,v|
+ model.algorithms[type] ||= {}
+ model.algorithms[type][p] = v
+ end
+ else
+ model.algorithms[type] = parameters
+ end
end
- # set defaults for empty parameters
model.prediction_feature_id = prediction_feature.id
model.training_dataset_id = training_dataset.id
model.name = "#{training_dataset.name} #{prediction_feature.name}"
- #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm
+ if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
+ model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types]
+ end
model.save
- p model
model
end
- def correlation_filter
- # TODO: speedup, single assignment of all features to R+ parallel computation of significance?
- self.relevant_features = {}
- measurements = []
- substances = []
- training_dataset.substances.each do |s|
- training_dataset.values(s,prediction_feature_id).each do |act|
- measurements << act
- substances << s
- end
- end
- R.assign "tox", measurements
- feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
- feature_ids.select!{|fid| Feature.find(fid).category == feature_selection_algorithm_parameters[:category]} if feature_selection_algorithm_parameters[:category]
- feature_ids.each do |feature_id|
- feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]}
- unless feature_values.uniq.size == 1
- R.assign "feature", feature_values
- begin
- R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
- pvalue = R.eval("cor$p.value").to_ruby
- if pvalue <= 0.05
- r = R.eval("cor$estimate").to_ruby
- self.relevant_features[feature_id] = {}
- self.relevant_features[feature_id]["pvalue"] = pvalue
- self.relevant_features[feature_id]["r"] = r
- self.relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby
- self.relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby
- end
- rescue
- warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed."
- end
- end
- end
- self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
- end
-
def predict_substance substance
- neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols
- neighbor_algorithm_parameters[:relevant_features] = self.relevant_features if self.relevant_features
- neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters)
+ neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features
measurements = nil
prediction = {}
# handle query substance
@@ -153,9 +143,17 @@ module OpenTox
prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value
else
# call prediction algorithm
- klass,method = prediction_algorithm.split('.')
- params = prediction_algorithm_parameters.merge({:substance => substance, :neighbors => neighbors})
- result = Object.const_get(klass).send(method,params)
+ case algorithms[:descriptors][:method]
+ when "fingerprint"
+ descriptors = substance.fingerprints[algorithms[:descriptors][:type]]
+ when "properties"
+ descriptors = substance.properties
+ else
+ bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available."
+ end
+ params = algorithms[:prediction].merge({:descriptors => descriptors, :neighbors => neighbors})
+ params.delete :method
+ result = Algorithm.run algorithms[:prediction][:method], params
prediction.merge! result
prediction[:neighbors] = neighbors
prediction[:neighbors] ||= []
@@ -176,7 +174,7 @@ module OpenTox
elsif object.is_a? Dataset
substances = object.substances
else
- bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
+ bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
end
# make predictions
@@ -194,7 +192,6 @@ module OpenTox
elsif object.is_a? Array
return predictions
elsif object.is_a? Dataset
- #predictions.each{|cid,p| p.delete(:neighbors)}
# prepare prediction dataset
measurement_feature = Feature.find prediction_feature_id
@@ -205,8 +202,6 @@ module OpenTox
:prediction_feature_id => prediction_feature.id,
:predictions => predictions
)
-
- #prediction_dataset.save
return prediction_dataset
end
@@ -314,7 +309,7 @@ module OpenTox
:feature_selection_algorithm_parameters => {:category => category},
:neighbor_algorithm => "physchem_neighbors",
:neighbor_algorithm_parameters => {:min_sim => 0.5},
- :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression",
+ :prediction_algorithm => "OpenTox::Algorithm::Regression.physchem_regression",
:prediction_algorithm_parameters => {:method => 'rf'}, # random forests
}
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index b1a3835..6905f6f 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -5,10 +5,10 @@ module OpenTox
field :core, type: Hash, default: {}
field :coating, type: Array, default: []
- #field :proteomics, type: Hash, default: {}
attr_accessor :scaled_values
+=begin
def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:, relevant_features:
dataset = Dataset.find(dataset_id)
#relevant_features = {}
@@ -27,12 +27,12 @@ module OpenTox
substances.each do |substance|
values = dataset.values(substance,prediction_feature_id)
if values
- common_descriptors = relevant_features.keys & substance.physchem_descriptors.keys
+ common_descriptors = relevant_features.keys & substance.descriptors.keys
# scale values
- query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]}
- @scaled_values = common_descriptors.collect{|d| [d,(physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h
- neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]}
- neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h
+ query_descriptors = common_descriptors.collect{|d| (descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]}
+ @scaled_values = common_descriptors.collect{|d| [d,(descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h
+ neighbor_descriptors = common_descriptors.collect{|d| (substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]}
+ neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h
#weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]}
weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2}
sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights)
@@ -54,18 +54,19 @@ module OpenTox
neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]}
neighbors
end
+=end
def add_feature feature, value, dataset
unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand
case feature.category
when "P-CHEM"
- physchem_descriptors[feature.id.to_s] ||= []
- physchem_descriptors[feature.id.to_s] << value
- physchem_descriptors[feature.id.to_s].uniq!
+ properties[feature.id.to_s] ||= []
+ properties[feature.id.to_s] << value
+ properties[feature.id.to_s].uniq!
when "Proteomics"
- physchem_descriptors[feature.id.to_s] ||= []
- physchem_descriptors[feature.id.to_s] << value
- physchem_descriptors[feature.id.to_s].uniq!
+ properties[feature.id.to_s] ||= []
+ properties[feature.id.to_s] << value
+ properties[feature.id.to_s].uniq!
when "TOX"
dataset.add self, feature, value
else
diff --git a/lib/regression.rb b/lib/regression.rb
index 269a743..396c9e4 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,7 +3,8 @@ module OpenTox
class Regression
- def self.local_weighted_average substance:, neighbors:
+ def self.weighted_average descriptors:nil, neighbors:, parameters:nil
+ # TODO: prediction_interval
weighted_sum = 0.0
sim_sum = 0.0
neighbors.each do |neighbor|
@@ -18,7 +19,57 @@ module OpenTox
{:value => prediction}
end
- def self.local_fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05"
+ def self.caret descriptors:, neighbors:, method: "pls", parameters:nil
+ values = []
+ descriptors = {}
+ weights = []
+ descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort
+
+ neighbors.each do |n|
+ activities = n["measurements"]
+ activities.each do |act|
+ values << act
+ weights << n["similarity"]
+ descriptor_ids.each do |id|
+ descriptors[id] ||= []
+ descriptors[id] << n["descriptors"].include?(id)
+ end
+ end if activities
+ end
+
+ variables = []
+ data_frame = [values]
+
+ descriptors.each do |k,v|
+ unless v.uniq.size == 1
+ data_frame << v.collect{|m| m ? "T" : "F"}
+ variables << k
+ end
+ end
+
+ if variables.empty?
+ prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
+ prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+ prediction
+ else
+ substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"}
+ #puts data_frame.to_yaml
+ prediction = r_model_prediction method, data_frame, variables, weights, substance_features
+ if prediction.nil? or prediction[:value].nil?
+ prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
+ prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
+ prediction
+ else
+ prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
+ prediction[:value] = prediction[:value]
+ prediction[:rmse] = prediction[:rmse]
+ prediction
+ end
+ end
+
+ end
+
+ def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05"
values = []
fingerprints = {}
weights = []
@@ -48,14 +99,14 @@ module OpenTox
end
if variables.empty?
- prediction = local_weighted_average(substance: substance, neighbors: neighbors)
+ prediction = weighted_average(substance: substance, neighbors: neighbors)
prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
prediction
else
substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"}
prediction = r_model_prediction method, data_frame, variables, weights, substance_features
if prediction.nil? or prediction[:value].nil?
- prediction = local_weighted_average(substance: substance, neighbors: neighbors)
+ prediction = weighted_average(substance: substance, neighbors: neighbors)
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
prediction
else
@@ -68,7 +119,8 @@ module OpenTox
end
- def self.local_physchem_regression substance:, neighbors:, method: "pls"
+=begin
+ def self.physchem_regression substance:, neighbors:, method: "pls"
activities = []
weights = []
@@ -104,7 +156,7 @@ module OpenTox
pc_ids.compact!
if pc_ids.empty?
- prediction = local_weighted_average(substance: substance, neighbors: neighbors)
+ prediction = weighted_average(substance: substance, neighbors: neighbors)
prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances."
prediction
else
@@ -122,7 +174,7 @@ module OpenTox
pc_ids.compact!
prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
if prediction.nil?
- prediction = local_weighted_average(substance: substance, neighbors: neighbors)
+ prediction = weighted_average(substance: substance, neighbors: neighbors)
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
end
p prediction
@@ -130,6 +182,7 @@ module OpenTox
end
end
+=end
def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
R.assign "weights", training_weights
diff --git a/lib/similarity.rb b/lib/similarity.rb
index 00179c1..b9b4571 100644
--- a/lib/similarity.rb
+++ b/lib/similarity.rb
@@ -15,21 +15,22 @@ module OpenTox
class Similarity
- def self.tanimoto a, b
- ( a & b).size/(a|b).size.to_f
+ def self.tanimoto fingerprints
+ ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
end
- def self.euclid a, b
- sq = a.zip(b).map{|a,b| (a - b) ** 2}
+ def self.euclid fingerprints
+ sq = fingerprints[0].zip(fingerprints[1]).map{|a,b| (a - b) ** 2}
Math.sqrt(sq.inject(0) {|s,c| s + c})
end
# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
- def self.cosine a, b
- Algorithm::Vector.dot_product(a, b) / (Algorithm::Vector.magnitude(a) * Algorithm::Vector.magnitude(b))
+ def self.cosine fingerprints
+ Algorithm::Vector.dot_product(fingerprints[0], fingerprints[1]) / (Algorithm::Vector.magnitude(fingerprints[0]) * Algorithm::Vector.magnitude(fingerprints[1]))
end
- def self.weighted_cosine(a, b, w)
+ def self.weighted_cosine fingerprints # [a,b,weights]
+ a, b, w = fingerprints
dot_product = 0
magnitude_a = 0
magnitude_b = 0
diff --git a/lib/substance.rb b/lib/substance.rb
index 6768ce7..d271327 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -1,9 +1,68 @@
module OpenTox
class Substance
- field :physchem_descriptors, type: Hash, default: {}
+ field :properties, type: Hash, default: {}
field :dataset_ids, type: Array, default: []
end
-end
+ def neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features:nil
+ # TODO enable empty dataset_id -> use complete db
+ case descriptors[:method]
+ when "fingerprint"
+ fingerprint_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity
+ when "properties"
+ properties_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity, relevant_features: relevant_features
+ else
+ bad_request_error "Descriptor method '#{descriptors[:method]}' not implemented."
+ end
+ end
+
+ def fingerprint_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:
+ neighbors = []
+ dataset = Dataset.find(dataset_id)
+ dataset.substances.each do |substance|
+ values = dataset.values(substance,prediction_feature_id)
+ if values
+ query_descriptors = self.send(descriptors[:method].to_sym, descriptors[:type])
+ candidate_descriptors = substance.send(descriptors[:method].to_sym, descriptors[:type])
+ sim = Algorithm.run similarity[:method], [query_descriptors, candidate_descriptors]
+ neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min]
+ end
+ end
+ neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]}
+ end
+ def properties_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features:
+ neighbors = []
+ dataset = Dataset.find(dataset_id)
+ weights = relevant_features.collect{|k,v| v["r"]**2}
+ means = relevant_features.collect{|k,v| v["mean"]}
+ standard_deviations = relevant_features.collect{|k,v| v["sd"]}
+ query_descriptors = relevant_features.keys.collect{|i| properties[i].is_a?(Array) ? properties[i].median : nil }
+ dataset.substances.each do |substance|
+ values = dataset.values(substance,prediction_feature_id)
+ # exclude nanoparticles with different core
+ # TODO validate exclusion
+ next if substance.is_a? Nanoparticle and substance.core != self.core
+ if values
+ candidate_descriptors = relevant_features.keys.collect{|i| substance.properties[i].is_a?(Array) ? substance.properties[i].median : nil }
+ q = []
+ c = []
+ w = []
+ (0..relevant_features.size-1).each do |i|
+ # add only complete pairs
+ if query_descriptors[i] and candidate_descriptors[i]
+ w << weights[i]
+ # scale values
+ q << (query_descriptors[i] - means[i])/standard_deviations[i]
+ c << (candidate_descriptors[i] - means[i])/standard_deviations[i]
+ end
+ end
+ sim = Algorithm.run similarity[:method], [q, c, w]
+ neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min]
+ end
+ end
+ neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]}
+ end
+
+end