From dc4ab1f4e64d738d6c0b70f0b690a2359685080f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 12 Oct 2016 21:32:27 +0200 Subject: physchem regression, correlation_filter for fingerprints --- lib/caret.rb | 184 ++++++++++++++----------------------------- lib/classification.rb | 23 ++---- lib/compound.rb | 48 ++---------- lib/feature_selection.rb | 60 +++++++-------- lib/model.rb | 197 ++++++++++++++++++++++++++++++++--------------- lib/overwrite.rb | 13 +++- lib/physchem.rb | 14 ++-- lib/regression.rb | 15 ++-- lib/similarity.rb | 25 ++++-- lib/substance.rb | 60 --------------- test/model.rb | 41 ++++------ test/regression.rb | 37 ++++++++- 12 files changed, 328 insertions(+), 389 deletions(-) diff --git a/lib/caret.rb b/lib/caret.rb index b999b06..59e02da 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -5,33 +5,56 @@ module OpenTox # TODO classification # model list: https://topepo.github.io/caret/modelList.html - attr_accessor :descriptors, :neighbors, :method, :relevant_features, :data_frame, :feature_names, :weights, :query_features - - def initialize descriptors:, neighbors:, method:, relevant_features: - @descriptors = descriptors - @neighbors = neighbors - @method = method - @relevant_features = relevant_features - end - - def self.regression descriptors:, neighbors:, method:, relevant_features:nil - - caret = new(descriptors:descriptors, neighbors:neighbors, method:method, relevant_features:relevant_features) - # collect training data for R - if descriptors.is_a? Array - caret.fingerprint2R - elsif descriptors.is_a? Hash - caret.properties2R - else - bad_request_error "Descriptors should be a fingerprint (Array) or properties (Hash). Cannot handle '#{descriptors.class}'." - end - if caret.feature_names.empty? 
or caret.data_frame.flatten.uniq == ["NA"] - prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables: + if independent_variables.flatten.uniq == ["NA"] + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." else - prediction = caret.r_model_prediction + dependent_variables.each_with_index do |v,i| + dependent_variables[i] = to_r(v) + end + independent_variables.each_with_index do |c,i| + c.each_with_index do |v,j| + independent_variables[i][j] = to_r(v) + end + end + query_variables.each_with_index do |v,i| + query_variables[i] = to_r(v) + end + begin + R.assign "weights", weights + r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", (0..independent_variables.size-1).to_a + R.eval "names(data) <- append(c('activities'),features)" # + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" + rescue => e + $logger.debug "R caret model creation error for:" + $logger.debug JSON.pretty_generate(dependent_variables) + $logger.debug JSON.pretty_generate(independent_variables) + return {:value => nil, :warning => "R caret model creation error."} + end + begin + R.eval "query <- data.frame(rbind(c(#{query_variables.join ','})))" + R.eval "names(query) <- features" + R.eval "prediction <- predict(model,query)" + value = R.eval("prediction").to_f + rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f + r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f + prediction_interval = value-1.96*rmse, value+1.96*rmse + prediction = { + :value => value, + :rmse => rmse, 
:r_squared => r_squared, + :prediction_interval => prediction_interval + } + rescue => e + $logger.debug "R caret prediction error for:" + $logger.debug self.inspect + return nil + end if prediction.nil? or prediction[:value].nil? - prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." end end @@ -39,111 +62,18 @@ module OpenTox end - def fingerprint2R - - values = [] - features = {} - @weights = [] - descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort - - neighbors.each do |n| - activities = n["measurements"] - activities.each do |act| - values << act - @weights << n["similarity"] - descriptor_ids.each do |id| - features[id] ||= [] - features[id] << n["descriptors"].include?(id) - end - end if activities - end - - @feature_names = [] - @data_frame = [values] - - features.each do |k,v| - unless v.uniq.size == 1 - @data_frame << v.collect{|m| m ? "T" : "F"} - @feature_names << k - end - end - @query_features = @feature_names.collect{|f| descriptors.include?(f) ? "T" : "F"} - + # call caret methods dynamically, e.g. 
Caret.pls + def self.method_missing(sym, *args, &block) + args.first[:method] = sym.to_s + self.create_model_and_predict args.first end - - def properties2R - - @weights = [] - @feature_names = [] - @query_features = [] - - # keep only descriptors with values - @relevant_features.keys.each_with_index do |f,i| - if @descriptors[f] - @feature_names << f - @query_features << @descriptors[f].median - else - neighbors.each do |n| - n["descriptors"].delete_at i - end - end - end - - measurements = neighbors.collect{|n| n["measurements"]}.flatten - # initialize data frame with 'NA' defaults - @data_frame = Array.new(@feature_names.size+1){Array.new(measurements.size,"NA") } - - i = 0 - # parse neighbor activities and descriptors - neighbors.each do |n| - activities = n["measurements"] - activities.each do |act| # multiple measurements are treated as separate instances - unless n["descriptors"].include?(nil) - data_frame[0][i] = act - @weights << n["similarity"] - n["descriptors"].each_with_index do |d,j| - @data_frame[j+1][i] = d - end - i += 1 - end - end if activities # ignore neighbors without measurements - end - - end - - def r_model_prediction - begin - R.assign "weights", @weights - r_data_frame = "data.frame(#{@data_frame.collect{|r| "c(#{r.join(',')})"}.join(', ')})" - R.eval "data <- #{r_data_frame}" - R.assign "features", @feature_names - R.eval "names(data) <- append(c('activities'),features)" # - R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" - rescue => e - $logger.debug "R caret model creation error for:" - $logger.debug JSON.pretty_generate(self.inspect) - return nil - end - begin - R.eval "query <- data.frame(rbind(c(#{@query_features.join ','})))" - R.eval "names(query) <- features" - R.eval "prediction <- predict(model,query)" - value = R.eval("prediction").to_f - rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f - r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f - 
prediction_interval = value-1.96*rmse, value+1.96*rmse - { - :value => value, - :rmse => rmse, - :r_squared => r_squared, - :prediction_interval => prediction_interval - } - rescue => e - $logger.debug "R caret prediction error for:" - $logger.debug self.inspect - return nil - end + def self.to_r v + return "F" if v == false + return "T" if v == true + return "NA" if v.nil? + return "NA" if v.is_a? Float and v.nan? + v end end diff --git a/lib/classification.rb b/lib/classification.rb index 6582e7d..e8c179f 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,24 +3,17 @@ module OpenTox class Classification - def self.weighted_majority_vote descriptors:nil, neighbors:, method:nil, relevant_features:nil - sims = {} - neighbors.each do |neighbor| - sim = neighbor["similarity"] - activities = neighbor["measurements"] - activities.each do |act| - sims[act] ||= [] - sims[act] << sim - end if activities + def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables: + class_weights = {} + dependent_variables.each_with_index do |v,i| + class_weights[v] ||= [] + class_weights[v] << weights[i] unless v.nil? 
end - sim_all = sims.collect{|a,s| s}.flatten - sim_sum = sim_all.sum - sim_max = sim_all.max probabilities = {} - sims.each do |a,s| - probabilities[a] = s.sum/sim_sum + class_weights.each do |a,w| + probabilities[a] = w.sum/weights.sum end - probabilities = probabilities.collect{|a,p| [a,sim_max*p]}.to_h + probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) {:value => prediction,:probabilities => probabilities} diff --git a/lib/compound.rb b/lib/compound.rb index 93cfc03..0f178ce 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -75,7 +75,11 @@ module OpenTox fingerprints[type] end - def calculated_physchem descriptors=PhysChem.openbabel_descriptors + def calculated_properties types=["OPENBABEL"] + descriptors = [] + types.each do |t| + descriptors += PhysChem.descriptors OpenTox.const_get(t) + end # TODO: speedup java descriptors calculated_ids = properties.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. 
@@ -254,48 +258,6 @@ module OpenTox self["chemblid"] end -=begin - def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) - neighbors = [] - dataset = Dataset.find(dataset_id) - # TODO: fix db_neighbors -# if type == DEFAULT_FINGERPRINT -# neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id) -# neighbors.each do |n| -# n["measurements"] = dataset.values(n["_id"],prediction_feature_id) -# end -# else - query_fingerprint = self.fingerprint type - dataset.compounds.each do |compound| - values = dataset.values(compound,prediction_feature_id) - if values - candidate_fingerprint = compound.fingerprint type - sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) - neighbors << {"_id" => compound.id, "measurements" => values, "similarity" => sim} if sim >= min_sim - end -# end - end - neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} - end -=end - -# def physchem_neighbors params -# # TODO: fix, tests -# feature_dataset = Dataset.find params[:feature_dataset_id] -# query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] -# neighbors = [] -# feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| -# # TODO implement pearson and cosine similarity separatly -# R.assign "x", query_fingerprint -# R.assign "y", candidate_fingerprint -# sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first -# if sim >= params[:min_sim] -# neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming -# end -# end -# neighbors -# end - def db_neighbors min_sim: 0.1, dataset_id: p fingerprints[DEFAULT_FINGERPRINT] # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb index 43e3bea..f599539 100644 --- a/lib/feature_selection.rb +++ b/lib/feature_selection.rb @@ -3,41 +3,39 @@ module OpenTox class 
FeatureSelection - def self.correlation_filter dataset:, prediction_feature:, types:nil - # TODO: speedup, single assignment of all features to R+ parallel computation of significance? + def self.correlation_filter model relevant_features = {} - measurements = [] - substances = [] - dataset.substances.each do |s| - dataset.values(s,prediction_feature).each do |act| - measurements << act - substances << s - end - end - R.assign "tox", measurements - feature_ids = dataset.substances.collect{ |s| s["properties"].keys}.flatten.uniq - feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types - feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["properties"][feature_id].first if s["properties"][feature_id]} - unless feature_values.uniq.size == 1 - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - relevant_features[feature_id] = {} - relevant_features[feature_id]["pvalue"] = pvalue - relevant_features[feature_id]["r"] = r - relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby - relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby - end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." 
+ R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)} + model.descriptor_weights = [] + selected_variables = [] + selected_descriptor_ids = [] + model.independent_variables.each_with_index do |v,i| + R.assign "independent", v.collect{|n| to_r(n)} + begin + R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + model.descriptor_weights << R.eval("cor$estimate").to_ruby**2 + selected_variables << v + selected_descriptor_ids << model.descriptor_ids[i] end + rescue + #warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with '#{Feature.find(model.descriptor_ids[i]).name}' (#{v}) failed." + warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed." end end - relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + + model.independent_variables = selected_variables + model.descriptor_ids = selected_descriptor_ids + model + end + + def self.to_r v + return 0 if v == false + return 1 if v == true + return "NA" if v.nil? + return "NA" if v.is_a? Float and v.nan? 
+ v end end diff --git a/lib/model.rb b/lib/model.rb index 290309a..f3f0603 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -11,10 +11,18 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ + field :algorithms, type: Hash, default:{} field :training_dataset_id, type: BSON::ObjectId + field :substance_ids, type: Array, default:[] field :prediction_feature_id, type: BSON::ObjectId - field :algorithms, type: Hash - field :relevant_features, type: Hash + field :dependent_variables, type: Array, default:[] + field :descriptor_ids, type:Array, default:[] + field :independent_variables, type: Array, default:[] + field :fingerprints, type: Array, default:[] + field :descriptor_weights, type: Array, default:[] + field :descriptor_means, type: Array, default:[] + field :descriptor_sds, type: Array, default:[] + field :scaled_variables, type: Array, default:[] def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." 
unless prediction_feature or training_dataset @@ -40,7 +48,7 @@ module OpenTox model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id - model.name = "#{training_dataset.name} #{prediction_feature.name}" + model.name = "#{prediction_feature.name} (#{training_dataset.name})" # set defaults substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq @@ -49,10 +57,7 @@ module OpenTox if substance_classes.first == "OpenTox::Compound" model.algorithms = { - :descriptors => { - :method => "fingerprint", - :type => 'MP2D', - }, + :descriptors => ['MP2D'], :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.1 @@ -66,25 +71,20 @@ module OpenTox } elsif model.class == LazarRegression model.algorithms[:prediction] = { - :method => "Algorithm::Caret.regression", - :parameters => "pls", + :method => "Algorithm::Caret.pls", } end elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { - :descriptors => { - :method => "properties", - #:types => ["P-CHEM","Proteomics"], - :types => ["P-CHEM"], - }, + :descriptors => ["P-CHEM"], + #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", :min => 0.5 }, :prediction => { - :method => "Algorithm::Caret.regression", - :parameters => "rf", + :method => "Algorithm::Caret.rf", }, :feature_selection => { :method => "Algorithm::FeatureSelection.correlation_filter", @@ -106,63 +106,128 @@ module OpenTox end end + # parse dependent_variables from training dataset + training_dataset.substances.each do |substance| + values = training_dataset.values(substance,model.prediction_feature_id) + values.each do |v| + model.substance_ids << substance.id.to_s + model.dependent_variables << v + end if values + end + + # parse fingerprints + if model.fingerprints? 
+ model.algorithms[:descriptors].each do |type| + model.substances.each_with_index do |s,i| + model.fingerprints[i] ||= [] + model.fingerprints[i] += s.fingerprint(type) + model.fingerprints[i].uniq! + end + end + model.descriptor_ids = model.fingerprints.flatten.uniq + model.descriptor_ids.each do |d| + model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} + end + else + # parse independent_variables + if (model.algorithms[:descriptors] & ["PhysChem::OPENBABEL","PhysChem::CDK","PhysChem::JOELIB"]).empty? + properties = model.substances.collect { |s| s.properties } + all_property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} + + # calculate physchem properties + else + properties = model.substances.collect { |s| s.calculated_properties(model.algorithms[:descriptors]) } + model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}} + end + end + if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] - model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] + model = Algorithm.run model.algorithms[:feature_selection][:method], model + end + + # scale independent_variables + unless model.fingerprints? + model.independent_variables.each_with_index do |var,i| + model.descriptor_means[i] = var.mean + model.descriptor_sds[i] = var.standard_deviation + model.scaled_variables << var.collect{|v| v ? 
(v-model.descriptor_means[i])/model.descriptor_sds[i] : nil} + end end model.save model end def predict_substance substance - neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features - measurements = nil - prediction = {} - # handle query substance - if neighbors.collect{|n| n["_id"]}.include? substance.id - - query = neighbors.select{|n| n["_id"] == substance.id}.first - measurements = training_dataset.values(query["_id"],prediction_feature_id) - prediction[:measurements] = measurements - prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance." - neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) + + case algorithms[:similarity][:method] + when /tanimoto/ # binary features + similarity_descriptors = algorithms[:descriptors].collect{|type| substance.fingerprint(type)}.flatten.uniq + # TODO this excludes descriptors only present in the query substance + query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} + when /euclid|cosine/ # quantitative features + similarity_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + (prop-descriptor_means[i])/descriptor_sds[i] + } + query_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + substance.properties[id] + } + else + bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end - if neighbors.empty? 
- prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) - elsif neighbors.size == 1 - value = nil - m = neighbors.first["measurements"] - if m.size == 1 # single measurement - value = m.first - else # multiple measurement - if m.collect{|t| t.numeric?}.uniq == [true] # numeric - value = m.median - elsif m.uniq.size == 1 # single value - value = m.first - else # contradictory results - # TODO add majority vote?? + + prediction = {} + neighbor_ids = [] + neighbor_similarities = [] + neighbor_dependent_variables = [] + neighbor_independent_variables = [] + + prediction = {} + # find neighbors + substance_ids.each_with_index do |s,i| + # handle query substance + if substance.id.to_s == s + prediction[:measurements] ||= [] + prediction[:measurements] << dependent_variables[i] + prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." + else + next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core + if fingerprints? + neighbor_descriptors = fingerprints[i] + else + neighbor_descriptors = scaled_variables.collect{|v| v[i]} + end + sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] + if sim > algorithms[:similarity][:min] + neighbor_ids << s + neighbor_similarities << sim + neighbor_dependent_variables << dependent_variables[i] + independent_variables.each_with_index do |c,j| + neighbor_independent_variables[j] ||= [] + neighbor_independent_variables[j] << independent_variables[j][i] + end end end - prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value + end + + measurements = nil + + if neighbor_similarities.empty? 
+ prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + elsif neighbor_similarities.size == 1 + prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else # call prediction algorithm - case algorithms[:descriptors][:method] - when "fingerprint" - descriptors = substance.fingerprints[algorithms[:descriptors][:type]] - when "properties" - descriptors = substance.properties - else - bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." - end - params = { - :method => algorithms[:prediction][:parameters], - :descriptors => descriptors, - :neighbors => neighbors, - :relevant_features => relevant_features - } - result = Algorithm.run algorithms[:prediction][:method], params + result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors + p result prediction.merge! result - prediction[:neighbors] = neighbors - prediction[:neighbors] ||= [] + prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end prediction end @@ -221,6 +286,18 @@ module OpenTox Feature.find(prediction_feature_id) end + def descriptors + descriptor_ids.collect{|id| Feature.find(id)} + end + + def substances + substance_ids.collect{|id| Substance.find(id)} + end + + def fingerprints? + algorithms[:similarity][:method].match("tanimoto") ? 
true : false + end + end class LazarClassification < Lazar diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 4a79051..d0422ee 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -101,13 +101,13 @@ class Array end def mean - self.inject{ |sum, el| sum + el }.to_f / self.size + self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size end def sample_variance m = self.mean - sum = self.inject(0){|accum, i| accum +(i-m)**2 } - sum/(self.length - 1).to_f + sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 } + sum/(self.compact.length - 1).to_f end def standard_deviation @@ -123,6 +123,13 @@ class Array end end + def collect_with_index + result = [] + self.each_with_index do |elt, idx| + result << yield(elt, idx) + end + result + end end module URI diff --git a/lib/physchem.rb b/lib/physchem.rb index c32e382..327acd8 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -14,7 +14,7 @@ module OpenTox JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] - OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + OPENBABEL = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| name,description = d.split(/\s+/,2) ["Openbabel."+name,description] unless obexclude.include? 
name end.compact.sort{|a,b| a[0] <=> b[0]}] @@ -25,17 +25,17 @@ module OpenTox prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'') d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] } end - CDKDESCRIPTORS = cdkdescriptors + CDK = cdkdescriptors # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] # strip Joelib messages from stdout - JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + JOELIB = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| name = d[:java_class].sub(/^joelib2.feature.types./,'') ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name end.compact.sort{|a,b| a[0] <=> b[0]}] - DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + DESCRIPTORS = OPENBABEL.merge(CDK.merge(JOELIB)) require_relative "unique_descriptors.rb" @@ -65,15 +65,15 @@ module OpenTox end def self.openbabel_descriptors - descriptors OBDESCRIPTORS + descriptors OPENBABEL end def self.cdk_descriptors - descriptors CDKDESCRIPTORS + descriptors CDK end def self.joelib_descriptors - descriptors JOELIBDESCRIPTORS + descriptors JOELIB end def calculate compound diff --git a/lib/regression.rb b/lib/regression.rb index 0e5e06b..bed6df8 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,18 +3,15 @@ module OpenTox class Regression - def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil + def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables: + #def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: 
prediction_interval weighted_sum = 0.0 sim_sum = 0.0 - neighbors.each do |neighbor| - sim = neighbor["similarity"] - activities = neighbor["measurements"] - activities.each do |act| - weighted_sum += sim*act - sim_sum += sim - end if activities - end + dependent_variables.each_with_index do |v,i| + weighted_sum += weights[i]*dependent_variables[i] + sim_sum += weights[i] + end if dependent_variables sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end diff --git a/lib/similarity.rb b/lib/similarity.rb index b9b4571..328d42a 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -19,18 +19,19 @@ module OpenTox ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f end - def self.euclid fingerprints - sq = fingerprints[0].zip(fingerprints[1]).map{|a,b| (a - b) ** 2} + def self.euclid scaled_properties + sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2} Math.sqrt(sq.inject(0) {|s,c| s + c}) end # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity - def self.cosine fingerprints - Algorithm::Vector.dot_product(fingerprints[0], fingerprints[1]) / (Algorithm::Vector.magnitude(fingerprints[0]) * Algorithm::Vector.magnitude(fingerprints[1])) + def self.cosine scaled_properties + scaled_properties = remove_nils scaled_properties + Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1])) end - def self.weighted_cosine fingerprints # [a,b,weights] - a, b, w = fingerprints + def self.weighted_cosine scaled_properties # [a,b,weights] + a,b,w = remove_nils scaled_properties dot_product = 0 magnitude_a = 0 magnitude_b = 0 @@ -42,6 +43,18 @@ module OpenTox dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b)) end + def self.remove_nils scaled_properties + a =[]; b = []; w = [] + 
(0..scaled_properties.first.size-1).each do |i| + if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan? + a << scaled_properties[0][i] + b << scaled_properties[1][i] + w << scaled_properties[2][i] + end + end + [a,b,w] + end + end end end diff --git a/lib/substance.rb b/lib/substance.rb index d271327..31c465e 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -5,64 +5,4 @@ module OpenTox field :dataset_ids, type: Array, default: [] end - def neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features:nil - # TODO enable empty dataset_id -> use complete db - case descriptors[:method] - when "fingerprint" - fingerprint_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity - when "properties" - properties_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity, relevant_features: relevant_features - else - bad_request_error "Descriptor method '#{descriptors[:method]}' not implemented." 
- end - end - - def fingerprint_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity: - neighbors = [] - dataset = Dataset.find(dataset_id) - dataset.substances.each do |substance| - values = dataset.values(substance,prediction_feature_id) - if values - query_descriptors = self.send(descriptors[:method].to_sym, descriptors[:type]) - candidate_descriptors = substance.send(descriptors[:method].to_sym, descriptors[:type]) - sim = Algorithm.run similarity[:method], [query_descriptors, candidate_descriptors] - neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min] - end - end - neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} - end - - def properties_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features: - neighbors = [] - dataset = Dataset.find(dataset_id) - weights = relevant_features.collect{|k,v| v["r"]**2} - means = relevant_features.collect{|k,v| v["mean"]} - standard_deviations = relevant_features.collect{|k,v| v["sd"]} - query_descriptors = relevant_features.keys.collect{|i| properties[i].is_a?(Array) ? properties[i].median : nil } - dataset.substances.each do |substance| - values = dataset.values(substance,prediction_feature_id) - # exclude nanoparticles with different core - # TODO validate exclusion - next if substance.is_a? Nanoparticle and substance.core != self.core - if values - candidate_descriptors = relevant_features.keys.collect{|i| substance.properties[i].is_a?(Array) ? 
substance.properties[i].median : nil } - q = [] - c = [] - w = [] - (0..relevant_features.size-1).each do |i| - # add only complete pairs - if query_descriptors[i] and candidate_descriptors[i] - w << weights[i] - # scale values - q << (query_descriptors[i] - means[i])/standard_deviations[i] - c << (candidate_descriptors[i] - means[i])/standard_deviations[i] - end - end - sim = Algorithm.run similarity[:method], [q, c, w] - neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min] - end - end - neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} - end - end diff --git a/test/model.rb b/test/model.rb index 02b8e73..9f30928 100644 --- a/test/model.rb +++ b/test/model.rb @@ -4,17 +4,13 @@ class ModelTest < MiniTest::Test def test_default_regression algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D" - }, + :descriptors => [ "MP2D" ], :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.1 }, :prediction => { - :method => "Algorithm::Caret.regression", - :parameters => "pls", + :method => "Algorithm::Caret.pls", }, :feature_selection => nil, } @@ -29,17 +25,13 @@ class ModelTest < MiniTest::Test def test_regression_parameters algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D" - }, + :descriptors => [ "MP2D" ], :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.3 }, :prediction => { :method => "Algorithm::Regression.weighted_average", - :parameters => "rf", }, :feature_selection => nil, } @@ -57,18 +49,22 @@ class ModelTest < MiniTest::Test def test_physchem_regression algorithms = { - :descriptors => "physchem", + :descriptors => ["PhysChem::OPENBABEL"], :similarity => { - :method => "Algorithm::Similarity.weighted_cosine", + :method => "Algorithm::Similarity.cosine", } } training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") model = 
Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms assert_kind_of Model::LazarRegression, model - assert_equal "Algorithm::Caret.regression", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method] + assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method] assert_equal 0.1, model.algorithms[:similarity][:min] assert_equal algorithms[:descriptors], model.algorithms[:descriptors] + prediction = model.predict training_dataset.substances[10] + p prediction + refute_nil prediction[:value] + # TODO test prediction end def test_nanoparticle_default @@ -78,8 +74,7 @@ class ModelTest < MiniTest::Test training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first end model = Model::Lazar.create training_dataset: training_dataset - assert_equal "Algorithm::Caret.regression", model.algorithms[:prediction][:method] - assert_equal "rf", model.algorithms[:prediction][:parameters] + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method] prediction = model.predict training_dataset.substances[14] assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
@@ -99,7 +94,7 @@ class ModelTest < MiniTest::Test training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms assert_kind_of Model::LazarRegression, model - assert_equal "Algorithm::Caret.regression", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] assert_equal 0.1, model.algorithms[:similarity][:min] assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method] @@ -111,10 +106,7 @@ class ModelTest < MiniTest::Test def test_default_classification algorithms = { - :descriptors => { - :method => "fingerprint", - :type => 'MP2D', - }, + :descriptors => [ "MP2D" ], :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.1 @@ -135,10 +127,7 @@ class ModelTest < MiniTest::Test def test_classification_parameters algorithms = { - :descriptors => { - :method => "fingerprint", - :type => 'MACCS', - }, + :descriptors => ['MACCS'], :similarity => { :min => 0.4 }, diff --git a/test/regression.rb b/test/regression.rb index 4c21450..aad4195 100644 --- a/test/regression.rb +++ b/test/regression.rb @@ -45,12 +45,45 @@ class LazarRegressionTest < MiniTest::Test end def test_local_physchem_regression - skip # TODO: fix training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" - model = Model::Lazar.create(training_dataset.features.first, training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") + algorithms = { + :descriptors => ["PhysChem::OPENBABEL"], + :similarity => { + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 + }, + } + model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms) + p model compound = Compound.from_smiles "NC(=O)OCCC" prediction = 
model.predict compound refute_nil prediction[:value] end + def test_local_physchem_regression_with_feature_selection + training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" + algorithms = { + :descriptors => { + :method => "calculated_properties", + :types => ["OPENBABEL"] + }, + :similarity => { + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + model = Model::Lazar.create(training_dataset.features.first, training_dataset, algorithms) + p model + compound = Compound.from_smiles "NC(=O)OCCC" + prediction = model.predict compound + refute_nil prediction[:value] + end + + def test_local_physchem_classification + skip + end + end -- cgit v1.2.3