From 91787edb3682900bc5a2feeca66e5142f387fcc6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 7 Oct 2016 10:25:58 +0200 Subject: unified interface for prediction algorithms --- lib/caret.rb | 152 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/classification.rb | 2 +- lib/crossvalidation.rb | 4 +- lib/dataset.rb | 2 - lib/feature.rb | 18 +++--- lib/import.rb | 3 +- lib/nanoparticle.rb | 50 ---------------- lib/physchem.rb | 6 +- lib/regression.rb | 2 +- 9 files changed, 169 insertions(+), 70 deletions(-) create mode 100644 lib/caret.rb (limited to 'lib') diff --git a/lib/caret.rb b/lib/caret.rb new file mode 100644 index 0000000..b999b06 --- /dev/null +++ b/lib/caret.rb @@ -0,0 +1,152 @@ +module OpenTox + module Algorithm + + class Caret + # TODO classification + # model list: https://topepo.github.io/caret/modelList.html + + attr_accessor :descriptors, :neighbors, :method, :relevant_features, :data_frame, :feature_names, :weights, :query_features + + def initialize descriptors:, neighbors:, method:, relevant_features: + @descriptors = descriptors + @neighbors = neighbors + @method = method + @relevant_features = relevant_features + end + + def self.regression descriptors:, neighbors:, method:, relevant_features:nil + + caret = new(descriptors:descriptors, neighbors:neighbors, method:method, relevant_features:relevant_features) + # collect training data for R + if descriptors.is_a? Array + caret.fingerprint2R + elsif descriptors.is_a? Hash + caret.properties2R + else + bad_request_error "Descriptors should be a fingerprint (Array) or properties (Hash). Cannot handle '#{descriptors.class}'." + end + if caret.feature_names.empty? or caret.data_frame.flatten.uniq == ["NA"] + prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + else + prediction = caret.r_model_prediction + if prediction.nil? or prediction[:value].nil? + prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." + end + end + prediction + + end + + def fingerprint2R + + values = [] + features = {} + @weights = [] + descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort + + neighbors.each do |n| + activities = n["measurements"] + activities.each do |act| + values << act + @weights << n["similarity"] + descriptor_ids.each do |id| + features[id] ||= [] + features[id] << n["descriptors"].include?(id) + end + end if activities + end + + @feature_names = [] + @data_frame = [values] + + features.each do |k,v| + unless v.uniq.size == 1 + @data_frame << v.collect{|m| m ? "T" : "F"} + @feature_names << k + end + end + @query_features = @feature_names.collect{|f| descriptors.include?(f) ? "T" : "F"} + + end + + + def properties2R + + @weights = [] + @feature_names = [] + @query_features = [] + + # keep only descriptors with values + @relevant_features.keys.each_with_index do |f,i| + if @descriptors[f] + @feature_names << f + @query_features << @descriptors[f].median + else + neighbors.each do |n| + n["descriptors"].delete_at i + end + end + end + + measurements = neighbors.collect{|n| n["measurements"]}.flatten + # initialize data frame with 'NA' defaults + @data_frame = Array.new(@feature_names.size+1){Array.new(measurements.size,"NA") } + + i = 0 + # parse neighbor activities and descriptors + neighbors.each do |n| + activities = n["measurements"] + activities.each do |act| # multiple measurements are treated as separate instances + unless n["descriptors"].include?(nil) + data_frame[0][i] = act + @weights << n["similarity"] + n["descriptors"].each_with_index do |d,j| + @data_frame[j+1][i] = d + end + i += 1 + end + end if activities # ignore neighbors without measurements + end + + end + + def r_model_prediction + begin + R.assign "weights", @weights + r_data_frame = "data.frame(#{@data_frame.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", @feature_names + R.eval "names(data) <- append(c('activities'),features)" # + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" + rescue => e + $logger.debug "R caret model creation error for:" + $logger.debug JSON.pretty_generate(self.inspect) + return nil + end + begin + R.eval "query <- data.frame(rbind(c(#{@query_features.join ','})))" + R.eval "names(query) <- features" + R.eval "prediction <- predict(model,query)" + value = R.eval("prediction").to_f + rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f + r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f + prediction_interval = value-1.96*rmse, value+1.96*rmse + { + :value => value, + :rmse => rmse, + :r_squared => r_squared, + :prediction_interval => prediction_interval + } + rescue => e + $logger.debug "R caret prediction error for:" + $logger.debug self.inspect + return nil + end + end + + end + end +end + diff --git a/lib/classification.rb b/lib/classification.rb index 01ba878..6582e7d 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,7 +3,7 @@ module OpenTox class Classification - def self.weighted_majority_vote descriptors:nil, neighbors: + def self.weighted_majority_vote descriptors:nil, neighbors:, method:nil, relevant_features:nil sims = {} neighbors.each do |neighbor| sim = neighbor["similarity"] diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index d7a1f08..15d1031 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -16,10 +16,10 @@ module OpenTox folds: n ) cv.save # set created_at + nr_instances = 0 nr_unpredicted = 0 - #predictions = {} - training_dataset = Dataset.find model.training_dataset_id + training_dataset = model.training_dataset training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" diff --git a/lib/dataset.rb b/lib/dataset.rb index 2e21e5b..453fc35 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -165,11 +165,9 @@ module OpenTox feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - metadata["numeric"] = true numeric[i] = true feature = NumericFeature.find_or_create_by(metadata) else - metadata["nominal"] = true metadata["accept_values"] = values numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) diff --git a/lib/feature.rb b/lib/feature.rb index c6fb68a..0ca4d41 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -2,30 +2,28 @@ module OpenTox # Basic feature class class Feature - field :nominal, type: Boolean - field :numeric, type: Boolean field :measured, type: Boolean field :calculated, type: Boolean field :category, type: String field :unit, type: String field :conditions, type: Hash + + def nominal? + self.class == NominalFeature + end + + def numeric? + self.class == NumericFeature + end end # Feature for categorical variables class NominalFeature < Feature field :accept_values, type: Array - def initialize params - super params - nominal = true - end end # Feature for quantitative variables class NumericFeature < Feature - def initialize params - super params - numeric = true - end end # Feature for SMARTS fragments diff --git a/lib/import.rb b/lib/import.rb index 17894a9..8e57401 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -76,7 +76,7 @@ module OpenTox if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source) + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset end else @@ -98,6 +98,7 @@ module OpenTox :category => category, :conditions => effect["conditions"], :source => study["protocol"]["category"]["term"], + :measured => true, :warnings => warnings ) nanoparticle.parse_ambit_value feature, effect["result"], dataset diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 6905f6f..f74f263 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -8,54 +8,6 @@ module OpenTox attr_accessor :scaled_values -=begin - def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:, relevant_features: - dataset = Dataset.find(dataset_id) - #relevant_features = {} - measurements = [] - substances = [] - # TODO: exclude query activities!!! - dataset.substances.each do |s| - if s.core == self.core # exclude nanoparticles with different core - dataset.values(s,prediction_feature_id).each do |act| - measurements << act - substances << s - end - end - end - neighbors = [] - substances.each do |substance| - values = dataset.values(substance,prediction_feature_id) - if values - common_descriptors = relevant_features.keys & substance.descriptors.keys - # scale values - query_descriptors = common_descriptors.collect{|d| (descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - @scaled_values = common_descriptors.collect{|d| [d,(descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h - neighbor_descriptors = common_descriptors.collect{|d| (substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h - #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]} - weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} - sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) - neighbors << { - "_id" => substance.id, - "measurements" => values, - "similarity" => sim, - "common_descriptors" => common_descriptors.collect do |id| - { - :id => id, - :scaled_value => neighbor_scaled_values[id], - :p_value => relevant_features[id]["p_value"], - :r_squared => relevant_features[id]["r"]**2} - end - } if sim >= min_sim - end - end - $logger.debug "#{self.name}: #{neighbors.size} neighbors" - neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} - neighbors - end -=end - def add_feature feature, value, dataset unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand case feature.category @@ -78,8 +30,6 @@ module OpenTox end def parse_ambit_value feature, v, dataset - #p dataset - #p feature # TODO add study id to warnings v.delete "unit" # TODO: ppm instead of weights diff --git a/lib/physchem.rb b/lib/physchem.rb index 86300ba..c32e382 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -42,7 +42,7 @@ module OpenTox def self.descriptors desc=DESCRIPTORS desc.collect do |name,description| lib,desc = name.split('.',2) - self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end end @@ -54,11 +54,11 @@ module OpenTox CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n| dname = "#{name}.#{n}" description = DESCRIPTORS[dname] - udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end else description = DESCRIPTORS[name] - udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end end udesc diff --git a/lib/regression.rb b/lib/regression.rb index cf6d9cb..0e5e06b 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,7 @@ module OpenTox class Regression - def self.weighted_average descriptors:nil, neighbors:, parameters:nil + def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 -- cgit v1.2.3