From 5d4e5e463c2b87241bbb56e4658e1e26c0ed084f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 5 Oct 2016 13:22:12 +0200 Subject: substance and nanoparticle model creation and predictions --- lib/algorithm.rb | 13 +---- lib/classification.rb | 2 +- lib/compound.rb | 12 +++-- lib/feature_selection.rb | 46 ++++++++++++++++ lib/lazar.rb | 3 +- lib/model.rb | 135 +++++++++++++++++++++++------------------------ lib/nanoparticle.rb | 25 ++++----- lib/regression.rb | 67 ++++++++++++++++++++--- lib/similarity.rb | 15 +++--- lib/substance.rb | 63 +++++++++++++++++++++- 10 files changed, 265 insertions(+), 116 deletions(-) create mode 100644 lib/feature_selection.rb (limited to 'lib') diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 113f847..0e4b93a 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -2,18 +2,9 @@ module OpenTox module Algorithm - # Generic method to execute algorithms - # Algorithms should: - # - accept a Compound, an Array of Compounds or a Dataset as first argument - # - optional parameters as second argument - # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values - # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object - # @param [Hash] Algorithm parameters - # @return Algorithm result - def self.run algorithm, object, parameters=nil - bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/ + def self.run algorithm, parameters=nil klass,method = algorithm.split('.') - parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters) + Object.const_get(klass).send(method,parameters) end end diff --git a/lib/classification.rb b/lib/classification.rb index 03c32c4..01ba878 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,7 +3,7 @@ module OpenTox class Classification - def self.weighted_majority_vote substance:, neighbors: + def self.weighted_majority_vote descriptors:nil, neighbors: sims = {} neighbors.each do |neighbor| sim = neighbor["similarity"] diff --git a/lib/compound.rb b/lib/compound.rb index 4689d7a..4d62c53 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -75,9 +75,9 @@ module OpenTox fingerprints[type] end - def physchem descriptors=PhysChem.openbabel_descriptors + def calculated_physchem descriptors=PhysChem.openbabel_descriptors # TODO: speedup java descriptors - calculated_ids = physchem_descriptors.keys + calculated_ids = descriptors.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids descs = {} @@ -90,11 +90,11 @@ module OpenTox # avoid recalculating Cdk features with multiple values descs.keys.uniq.each do |k| descs[k].send(k[0].downcase,k[1],self).each do |n,v| - physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. end end save - physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} end def smarts_match smarts, count=false @@ -254,6 +254,7 @@ module OpenTox self["chemblid"] end +=begin def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) neighbors = [] dataset = Dataset.find(dataset_id) @@ -276,6 +277,7 @@ module OpenTox end neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} end +=end # def physchem_neighbors params # # TODO: fix, tests @@ -340,7 +342,7 @@ module OpenTox # @return [Float] molecular weight def molecular_weight mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") - physchem([mw_feature])[mw_feature.id.to_s] + calculated_physchem([mw_feature])[mw_feature.id.to_s] end private diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb new file mode 100644 index 0000000..43e3bea --- /dev/null +++ b/lib/feature_selection.rb @@ -0,0 +1,46 @@ +module OpenTox + module Algorithm + + class FeatureSelection + + def self.correlation_filter dataset:, prediction_feature:, types:nil + # TODO: speedup, single assignment of all features to R+ parallel computation of significance? + relevant_features = {} + measurements = [] + substances = [] + dataset.substances.each do |s| + dataset.values(s,prediction_feature).each do |act| + measurements << act + substances << s + end + end + R.assign "tox", measurements + feature_ids = dataset.substances.collect{ |s| s["properties"].keys}.flatten.uniq + feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types + feature_ids.each do |feature_id| + feature_values = substances.collect{|s| s["properties"][feature_id].first if s["properties"][feature_id]} + unless feature_values.uniq.size == 1 + R.assign "feature", feature_values + begin + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature_id] = {} + relevant_features[feature_id]["pvalue"] = pvalue + relevant_features[feature_id]["r"] = r + relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby + relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." + end + end + end + relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + end + + end + + end +end diff --git a/lib/lazar.rb b/lib/lazar.rb index 46605d3..d0f05c0 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -78,7 +78,8 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation"," "nanoparticle.rb", "dataset.rb", "algorithm.rb", - "similarity", + "similarity.rb", + "feature_selection.rb", "model.rb", "classification.rb", "regression.rb", diff --git a/lib/model.rb b/lib/model.rb index 749611e..a272580 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -28,101 +28,91 @@ module OpenTox when /Regression/ model = LazarRegression.new end + # guess model type elsif prediction_feature.numeric? model = LazarRegression.new else model = LazarClassification.new end + # set defaults - if model.class == LazarClassification + substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq + bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 + + if substance_classes.first == "OpenTox::Compound" + model.algorithms = { + :descriptors => { + :method => "fingerprint", + :type => 'MP2D', + }, :similarity => { - :descriptors => "fingerprint['MP2D']", :method => "Algorithm::Similarity.tanimoto", :min => 0.1 }, - :prediction => { - :descriptors => "fingerprint['MP2D']", - :method => "Algorithm::Classification.weighted_majority_vote", - }, - :feature_selection => nil, + :feature_selection => nil } - elsif model.class == LazarRegression + + if model.class == LazarClassification + model.algorithms[:prediction] = { + :method => "Algorithm::Classification.weighted_majority_vote", + } + elsif model.class == LazarRegression + model.algorithms[:prediction] = { + :method => "Algorithm::Regression.caret", + :parameters => "pls", + } + end + + elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { + :descriptors => { + :method => "properties", + #:types => ["P-CHEM","Proteomics"], + :types => ["P-CHEM"], + }, :similarity => { - :descriptors => "fingerprint['MP2D']", - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 }, :prediction => { - :descriptors => "fingerprint['MP2D']", - :method => "Algorithm::Regression.local_caret", - :parameters => "pls", + :method => "Algorithm::Regression.caret", + :parameters => "rf", + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", }, - :feature_selection => nil, } + else + bad_request_error "Cannot create models for #{substance_classes.first}." end - # overwrite defaults + # overwrite defaults with explicit parameters algorithms.each do |type,parameters| - parameters.each do |p,v| - model.algorithms[type][p] = v - end if parameters + if parameters and parameters.is_a? Hash + parameters.each do |p,v| + model.algorithms[type] ||= {} + model.algorithms[type][p] = v + end + else + model.algorithms[type] = parameters + end end - # set defaults for empty parameters model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id model.name = "#{training_dataset.name} #{prediction_feature.name}" - #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm + if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] + model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] + end model.save - p model model end - def correlation_filter - # TODO: speedup, single assignment of all features to R+ parallel computation of significance? - self.relevant_features = {} - measurements = [] - substances = [] - training_dataset.substances.each do |s| - training_dataset.values(s,prediction_feature_id).each do |act| - measurements << act - substances << s - end - end - R.assign "tox", measurements - feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq - feature_ids.select!{|fid| Feature.find(fid).category == feature_selection_algorithm_parameters[:category]} if feature_selection_algorithm_parameters[:category] - feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} - unless feature_values.uniq.size == 1 - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - self.relevant_features[feature_id] = {} - self.relevant_features[feature_id]["pvalue"] = pvalue - self.relevant_features[feature_id]["r"] = r - self.relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby - self.relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby - end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." - end - end - end - self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h - end - def predict_substance substance - neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols - neighbor_algorithm_parameters[:relevant_features] = self.relevant_features if self.relevant_features - neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) + neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features measurements = nil prediction = {} # handle query substance @@ -153,9 +143,17 @@ module OpenTox prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value else # call prediction algorithm - klass,method = prediction_algorithm.split('.') - params = prediction_algorithm_parameters.merge({:substance => substance, :neighbors => neighbors}) - result = Object.const_get(klass).send(method,params) + case algorithms[:descriptors][:method] + when "fingerprint" + descriptors = substance.fingerprints[algorithms[:descriptors][:type]] + when "properties" + descriptors = substance.properties + else + bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." + end + params = algorithms[:prediction].merge({:descriptors => descriptors, :neighbors => neighbors}) + params.delete :method + result = Algorithm.run algorithms[:prediction][:method], params prediction.merge! result prediction[:neighbors] = neighbors prediction[:neighbors] ||= [] @@ -176,7 +174,7 @@ module OpenTox elsif object.is_a? Dataset substances = object.substances else - bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." end # make predictions @@ -194,7 +192,6 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - #predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id @@ -205,8 +202,6 @@ module OpenTox :prediction_feature_id => prediction_feature.id, :predictions => predictions ) - - #prediction_dataset.save return prediction_dataset end @@ -314,7 +309,7 @@ module OpenTox :feature_selection_algorithm_parameters => {:category => category}, :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}, - :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", + :prediction_algorithm => "OpenTox::Algorithm::Regression.physchem_regression", :prediction_algorithm_parameters => {:method => 'rf'}, # random forests } training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index b1a3835..6905f6f 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -5,10 +5,10 @@ module OpenTox field :core, type: Hash, default: {} field :coating, type: Array, default: [] - #field :proteomics, type: Hash, default: {} attr_accessor :scaled_values +=begin def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:, relevant_features: dataset = Dataset.find(dataset_id) #relevant_features = {} @@ -27,12 +27,12 @@ module OpenTox substances.each do |substance| values = dataset.values(substance,prediction_feature_id) if values - common_descriptors = relevant_features.keys & substance.physchem_descriptors.keys + common_descriptors = relevant_features.keys & substance.descriptors.keys # scale values - query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - @scaled_values = common_descriptors.collect{|d| [d,(physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h - neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h + query_descriptors = common_descriptors.collect{|d| (descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + @scaled_values = common_descriptors.collect{|d| [d,(descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h + neighbor_descriptors = common_descriptors.collect{|d| (substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]} weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) @@ -54,18 +54,19 @@ module OpenTox neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} neighbors end +=end def add_feature feature, value, dataset unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand case feature.category when "P-CHEM" - physchem_descriptors[feature.id.to_s] ||= [] - physchem_descriptors[feature.id.to_s] << value - physchem_descriptors[feature.id.to_s].uniq! + properties[feature.id.to_s] ||= [] + properties[feature.id.to_s] << value + properties[feature.id.to_s].uniq! when "Proteomics" - physchem_descriptors[feature.id.to_s] ||= [] - physchem_descriptors[feature.id.to_s] << value - physchem_descriptors[feature.id.to_s].uniq! + properties[feature.id.to_s] ||= [] + properties[feature.id.to_s] << value + properties[feature.id.to_s].uniq! when "TOX" dataset.add self, feature, value else diff --git a/lib/regression.rb b/lib/regression.rb index 269a743..396c9e4 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,8 @@ module OpenTox class Regression - def self.local_weighted_average substance:, neighbors: + def self.weighted_average descriptors:nil, neighbors:, parameters:nil + # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 neighbors.each do |neighbor| @@ -18,7 +19,57 @@ module OpenTox {:value => prediction} end - def self.local_fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" + def self.caret descriptors:, neighbors:, method: "pls", parameters:nil + values = [] + descriptors = {} + weights = [] + descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort + + neighbors.each do |n| + activities = n["measurements"] + activities.each do |act| + values << act + weights << n["similarity"] + descriptor_ids.each do |id| + descriptors[id] ||= [] + descriptors[id] << n["descriptors"].include?(id) + end + end if activities + end + + variables = [] + data_frame = [values] + + descriptors.each do |k,v| + unless v.uniq.size == 1 + data_frame << v.collect{|m| m ? "T" : "F"} + variables << k + end + end + + if variables.empty? + prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction + else + substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"} + #puts data_frame.to_yaml + prediction = r_model_prediction method, data_frame, variables, weights, substance_features + if prediction.nil? or prediction[:value].nil? + prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) + prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." + prediction + else + prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] + prediction[:value] = prediction[:value] + prediction[:rmse] = prediction[:rmse] + prediction + end + end + + end + + def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" values = [] fingerprints = {} weights = [] @@ -48,14 +99,14 @@ module OpenTox end if variables.empty? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." prediction else substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} prediction = r_model_prediction method, data_frame, variables, weights, substance_features if prediction.nil? or prediction[:value].nil? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." prediction else @@ -68,7 +119,8 @@ module OpenTox end - def self.local_physchem_regression substance:, neighbors:, method: "pls" +=begin + def self.physchem_regression substance:, neighbors:, method: "pls" activities = [] weights = [] @@ -104,7 +156,7 @@ module OpenTox pc_ids.compact! if pc_ids.empty? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." prediction else @@ -122,7 +174,7 @@ module OpenTox pc_ids.compact! prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." end p prediction @@ -130,6 +182,7 @@ module OpenTox end end +=end def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights diff --git a/lib/similarity.rb b/lib/similarity.rb index 00179c1..b9b4571 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -15,21 +15,22 @@ module OpenTox class Similarity - def self.tanimoto a, b - ( a & b).size/(a|b).size.to_f + def self.tanimoto fingerprints + ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f end - def self.euclid a, b - sq = a.zip(b).map{|a,b| (a - b) ** 2} + def self.euclid fingerprints + sq = fingerprints[0].zip(fingerprints[1]).map{|a,b| (a - b) ** 2} Math.sqrt(sq.inject(0) {|s,c| s + c}) end # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity - def self.cosine a, b - Algorithm::Vector.dot_product(a, b) / (Algorithm::Vector.magnitude(a) * Algorithm::Vector.magnitude(b)) + def self.cosine fingerprints + Algorithm::Vector.dot_product(fingerprints[0], fingerprints[1]) / (Algorithm::Vector.magnitude(fingerprints[0]) * Algorithm::Vector.magnitude(fingerprints[1])) end - def self.weighted_cosine(a, b, w) + def self.weighted_cosine fingerprints # [a,b,weights] + a, b, w = fingerprints dot_product = 0 magnitude_a = 0 magnitude_b = 0 diff --git a/lib/substance.rb b/lib/substance.rb index 6768ce7..d271327 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,9 +1,68 @@ module OpenTox class Substance - field :physchem_descriptors, type: Hash, default: {} + field :properties, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end -end + def neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features:nil + # TODO enable empty dataset_id -> use complete db + case descriptors[:method] + when "fingerprint" + fingerprint_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity + when "properties" + properties_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity, relevant_features: relevant_features + else + bad_request_error "Descriptor method '#{descriptors[:method]}' not implemented." + end + end + + def fingerprint_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity: + neighbors = [] + dataset = Dataset.find(dataset_id) + dataset.substances.each do |substance| + values = dataset.values(substance,prediction_feature_id) + if values + query_descriptors = self.send(descriptors[:method].to_sym, descriptors[:type]) + candidate_descriptors = substance.send(descriptors[:method].to_sym, descriptors[:type]) + sim = Algorithm.run similarity[:method], [query_descriptors, candidate_descriptors] + neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min] + end + end + neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} + end + def properties_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features: + neighbors = [] + dataset = Dataset.find(dataset_id) + weights = relevant_features.collect{|k,v| v["r"]**2} + means = relevant_features.collect{|k,v| v["mean"]} + standard_deviations = relevant_features.collect{|k,v| v["sd"]} + query_descriptors = relevant_features.keys.collect{|i| properties[i].is_a?(Array) ? properties[i].median : nil } + dataset.substances.each do |substance| + values = dataset.values(substance,prediction_feature_id) + # exclude nanoparticles with different core + # TODO validate exclusion + next if substance.is_a? Nanoparticle and substance.core != self.core + if values + candidate_descriptors = relevant_features.keys.collect{|i| substance.properties[i].is_a?(Array) ? substance.properties[i].median : nil } + q = [] + c = [] + w = [] + (0..relevant_features.size-1).each do |i| + # add only complete pairs + if query_descriptors[i] and candidate_descriptors[i] + w << weights[i] + # scale values + q << (query_descriptors[i] - means[i])/standard_deviations[i] + c << (candidate_descriptors[i] - means[i])/standard_deviations[i] + end + end + sim = Algorithm.run similarity[:method], [q, c, w] + neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min] + end + end + neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} + end + +end -- cgit v1.2.3