From ca2bb0f90335b1f2c4ecc28ee423e85b281ffcf0 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 Nov 2015 17:50:17 +0100 Subject: neighbor search delegated to database backend --- lib/regression.rb | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 868c25f..575a1ef 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -1,39 +1,26 @@ -# TODO install R packages kernlab, caret, doMC, class, e1071 - - - # log transform activities (create new dataset) - # scale, normalize features, might not be necessary - # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is - # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression - # zero-order correlation and the semi-partial correlation - # seems to be necessary for svm - # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1 - # http://stackoverflow.com/questions/15436367/svm-scaling-input-values - # use lasso or elastic net?? - # select relevant features - # remove features with a single value - # remove correlated features - # remove features not correlated with endpoint module OpenTox module Algorithm class Regression def self.weighted_average compound, params + #p params.keys weighted_sum = 0.0 sim_sum = 0.0 confidence = 0.0 neighbors = params[:neighbors] activities = [] neighbors.each do |row| - n,sim,acts = row - confidence = sim if sim > confidence # distance to nearest neighbor - # TODO add LOO errors - acts.each do |act| - weighted_sum += sim*Math.log10(act) - activities << act - sim_sum += sim - end + #if row["dataset_ids"].include? params[:training_dataset_id] + sim = row["tanimoto"] + confidence = sim if sim > confidence # distance to nearest neighbor + # TODO add LOO errors + row["features"][params[:prediction_feature_id].to_s].each do |act| + weighted_sum += sim*Math.log10(act) + activities << act + sim_sum += sim + end + #end end #R.assign "activities", activities #R.eval "cv = cv(activities)" @@ -47,10 +34,8 @@ module OpenTox end def self.local_linear_regression compound, neighbors - p neighbors.size return nil unless neighbors.size > 0 features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq - p features training_data = Array.new(neighbors.size){Array.new(features.size,0)} neighbors.each_with_index do |n,i| #p n.first -- cgit v1.2.3 From f61b7d3c65d084747dc1bf87214e5ec0c57326be Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 9 Feb 2016 11:04:00 +0100 Subject: pls regression --- lib/regression.rb | 67 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 22 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 575a1ef..7c64d8f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -9,7 +9,7 @@ module OpenTox sim_sum = 0.0 confidence = 0.0 neighbors = params[:neighbors] - activities = [] + #activities = [] neighbors.each do |row| #if row["dataset_ids"].include? params[:training_dataset_id] sim = row["tanimoto"] @@ -17,7 +17,7 @@ module OpenTox # TODO add LOO errors row["features"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) - activities << act + #activities << act # TODO: Transformation?? sim_sum += sim end #end @@ -33,28 +33,51 @@ module OpenTox {:value => prediction,:confidence => confidence} end - def self.local_linear_regression compound, neighbors - return nil unless neighbors.size > 0 - features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq - training_data = Array.new(neighbors.size){Array.new(features.size,0)} - neighbors.each_with_index do |n,i| - #p n.first - neighbor = Compound.find n.first - features.each_with_index do |f,j| - training_data[i][j] = 1 if neighbor.fp4.include? f + def self.local_pls_regression compound, params + neighbors = params[:neighbors] + return {:value => nil, :confidence => nil} unless neighbors.size > 0 + activities = [] + fingerprints = {} + weights = [] + fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + fingerprint = neighbor.fingerprint + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] + fingerprint_ids.each_with_index do |id,j| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) + end + end + end + + name = Feature.find(params[:prediction_feature_id]).name + R.assign "activities", activities + R.assign "weights", weights + variables = [] + data_frame = ["c(#{activities.join ","})"] + fingerprints.each do |k,v| + unless v.uniq.size == 1 + data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))" + variables << "'#{k}'" end end - p training_data - - R.assign "activities", neighbors.collect{|n| n[2].median} - R.assign "features", training_data - R.eval "model <- lm(activities ~ features)" - R.eval "summary <- summary(model)" - p R.summary - compound_features = features.collect{|f| compound.fp4.include? f ? 1 : 0} - R.assign "compound_features", compound_features - R.eval "prediction <- predict(model,compound_features)" - p R.prediction + begin + R.eval "data <- data.frame(#{data_frame.join ","})" + R.eval "names(data) <- c('activities',#{variables.join ','})" + R.eval "model <- plsr(activities ~ .,data = data, ncomp = 3, weights = weights)" + compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } + R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" + R.eval "names(fingerprint) <- c(#{variables.join ','})" + R.eval "prediction <- predict(model,fingerprint)" + prediction = 10**R.eval("prediction").to_f + {:value => prediction, :confidence => 1} # TODO confidence + rescue + {:value => nil, :confidence => nil} # TODO confidence + end end -- cgit v1.2.3 From e778475c578f13f30af4437845716d7e781c2609 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 13 Feb 2016 13:15:29 +0100 Subject: improved handling of duplicates in validations --- lib/regression.rb | 62 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 7c64d8f..2b41851 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -4,23 +4,19 @@ module OpenTox class Regression def self.weighted_average compound, params - #p params.keys weighted_sum = 0.0 sim_sum = 0.0 confidence = 0.0 neighbors = params[:neighbors] - #activities = [] neighbors.each do |row| - #if row["dataset_ids"].include? params[:training_dataset_id] - sim = row["tanimoto"] - confidence = sim if sim > confidence # distance to nearest neighbor - # TODO add LOO errors - row["features"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum += sim*Math.log10(act) - #activities << act # TODO: Transformation?? - sim_sum += sim - end - #end + sim = row["tanimoto"] + confidence = sim if sim > confidence # distance to nearest neighbor + # TODO add LOO errors + row["features"][params[:prediction_feature_id].to_s].each do |act| + weighted_sum += sim*Math.log10(act) + #activities << act # TODO: Transformation?? + sim_sum += sim + end end #R.assign "activities", activities #R.eval "cv = cv(activities)" @@ -35,7 +31,7 @@ module OpenTox def self.local_pls_regression compound, params neighbors = params[:neighbors] - return {:value => nil, :confidence => nil} unless neighbors.size > 0 + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 activities = [] fingerprints = {} weights = [] @@ -62,21 +58,37 @@ module OpenTox fingerprints.each do |k,v| unless v.uniq.size == 1 data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))" - variables << "'#{k}'" + variables << k end end - begin + if variables.empty? + result = weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result + return {:value => nil, :confidence => nil} # TODO confidence + else R.eval "data <- data.frame(#{data_frame.join ","})" - R.eval "names(data) <- c('activities',#{variables.join ','})" - R.eval "model <- plsr(activities ~ .,data = data, ncomp = 3, weights = weights)" - compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } - R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" - R.eval "names(fingerprint) <- c(#{variables.join ','})" - R.eval "prediction <- predict(model,fingerprint)" - prediction = 10**R.eval("prediction").to_f - {:value => prediction, :confidence => 1} # TODO confidence - rescue - {:value => nil, :confidence => nil} # TODO confidence + R.assign "features", variables + R.eval "names(data) <- append(c('activities'),features)" # + begin + R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)" + rescue # fall back to weighted average + result = weighted_average(compound, params) + result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return result + end + #begin + #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX + compound_features = variables.collect{|f| compound.fingerprint.include? f } + R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" + R.eval "names(fingerprint) <- features" # + R.eval "prediction <- predict(model,fingerprint)" + prediction = 10**R.eval("prediction").to_f + return {:value => prediction, :confidence => 1} # TODO confidence + #rescue + #p "Prediction failed" + #return {:value => nil, :confidence => nil} # TODO confidence + #end end end -- cgit v1.2.3 From b90720cc26d789a96fa6f7a054fe06fc8b4ef33d Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 27 Feb 2016 16:47:48 +0100 Subject: local pls regression as default regression algorithm --- lib/regression.rb | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 2b41851..10a1861 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -93,6 +93,70 @@ module OpenTox end + def self.local_physchem_regression compound, params + neighbors = params[:neighbors] + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + activities = [] + fingerprints = {} + weights = [] + fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + fingerprint = neighbor.fingerprint + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] + fingerprint_ids.each_with_index do |id,j| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) + end + end + end + + name = Feature.find(params[:prediction_feature_id]).name + R.assign "activities", activities + R.assign "weights", weights + variables = [] + data_frame = ["c(#{activities.join ","})"] + fingerprints.each do |k,v| + unless v.uniq.size == 1 + data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))" + variables << k + end + end + if variables.empty? + result = weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result + return {:value => nil, :confidence => nil} # TODO confidence + else + R.eval "data <- data.frame(#{data_frame.join ","})" + R.assign "features", variables + R.eval "names(data) <- append(c('activities'),features)" # + begin + R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)" + rescue # fall back to weighted average + result = weighted_average(compound, params) + result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return result + end + #begin + #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX + compound_features = variables.collect{|f| compound.fingerprint.include? f } + R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" + R.eval "names(fingerprint) <- features" # + R.eval "prediction <- predict(model,fingerprint)" + prediction = 10**R.eval("prediction").to_f + return {:value => prediction, :confidence => 1} # TODO confidence + #rescue + #p "Prediction failed" + #return {:value => nil, :confidence => nil} # TODO confidence + #end + end + + end + def self.weighted_average_with_relevant_fingerprints neighbors weighted_sum = 0.0 sim_sum = 0.0 -- cgit v1.2.3 From 72f6cd966a249859e009a0db5f7b089aad1d6511 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 29 Feb 2016 08:59:43 +0100 Subject: regression crossvalidation fixed --- lib/regression.rb | 74 +++++++++++++++++++++++-------------------------------- 1 file changed, 31 insertions(+), 43 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 10a1861..0694a68 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -1,6 +1,7 @@ module OpenTox module Algorithm + # TODO add LOO errors class Regression def self.weighted_average compound, params @@ -11,19 +12,11 @@ module OpenTox neighbors.each do |row| sim = row["tanimoto"] confidence = sim if sim > confidence # distance to nearest neighbor - # TODO add LOO errors row["features"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) - #activities << act # TODO: Transformation?? sim_sum += sim end end - #R.assign "activities", activities - #R.eval "cv = cv(activities)" - #confidence /= activities.standard_deviation#/activities.mean - #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size] - #confidence = sim_sum/neighbors.size.to_f - #confidence = neighbors.size.to_f confidence = 0 if confidence.nan? sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) {:value => prediction,:confidence => confidence} @@ -94,45 +87,46 @@ module OpenTox end def self.local_physchem_regression compound, params + neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + activities = [] - fingerprints = {} weights = [] - fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + physchem = {} neighbors.each_with_index do |row,i| neighbor = Compound.find row["_id"] - fingerprint = neighbor.fingerprint row["features"][params[:prediction_feature_id].to_s].each do |act| activities << Math.log10(act) - weights << row["tanimoto"] - fingerprint_ids.each_with_index do |id,j| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) + weights << row["tanimoto"] # TODO cosine ? + neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + physchem[pid] ||= [] + physchem[pid] << v end end end - name = Feature.find(params[:prediction_feature_id]).name - R.assign "activities", activities - R.assign "weights", weights - variables = [] - data_frame = ["c(#{activities.join ","})"] - fingerprints.each do |k,v| - unless v.uniq.size == 1 - data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))" - variables << k - end + # remove properties with a single value + physchem.each do |pid,v| + physchem.delete(pid) if v.uniq.size <= 1 end - if variables.empty? - result = weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result - return {:value => nil, :confidence => nil} # TODO confidence + + if physchem.empty? + result = weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result else + + name = Feature.find(params[:prediction_feature_id]).name + R.assign "weights", weights + data_frame = ["c(#{activities.join ","})"] + physchem.keys.each do |pid| + data_frame << "c(#{physchem[pid].join ","})" + end R.eval "data <- data.frame(#{data_frame.join ","})" - R.assign "features", variables + R.assign "features", physchem.keys R.eval "names(data) <- append(c('activities'),features)" # begin R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)" @@ -141,18 +135,12 @@ module OpenTox result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return result end - #begin - #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX - compound_features = variables.collect{|f| compound.fingerprint.include? f } - R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" - R.eval "names(fingerprint) <- features" # - R.eval "prediction <- predict(model,fingerprint)" - prediction = 10**R.eval("prediction").to_f - return {:value => prediction, :confidence => 1} # TODO confidence - #rescue - #p "Prediction failed" - #return {:value => nil, :confidence => nil} # TODO confidence - #end + compound_features = physchem.keys.collect{|pid| compound.physchem[pid]} + R.eval "fingerprint <- rbind(c(#{compound_features.join ','}))" + R.eval "names(fingerprint) <- features" # + R.eval "prediction <- predict(model,fingerprint)" + prediction = 10**R.eval("prediction").to_f + return {:value => prediction, :confidence => 1} # TODO confidence end end -- cgit v1.2.3 From 003332ad95dd4c63d0b7c00d22c73f460b163139 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 29 Feb 2016 14:11:30 +0100 Subject: modular regression algorithms --- lib/regression.rb | 269 ++++++++---------------------------------------------- 1 file changed, 38 insertions(+), 231 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 0694a68..c988542 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -22,7 +22,8 @@ module OpenTox {:value => prediction,:confidence => confidence} end - def self.local_pls_regression compound, params + # TODO explicit neighbors, also for physchem + def self.local_fingerprint_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 activities = [] @@ -43,50 +44,35 @@ module OpenTox end end - name = Feature.find(params[:prediction_feature_id]).name - R.assign "activities", activities - R.assign "weights", weights variables = [] - data_frame = ["c(#{activities.join ","})"] + data_frame = [activities] fingerprints.each do |k,v| unless v.uniq.size == 1 - data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))" + data_frame << v.collect{|m| m ? "T" : "F"} variables << k end end + if variables.empty? result = weighted_average(compound, params) result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result - return {:value => nil, :confidence => nil} # TODO confidence + else - R.eval "data <- data.frame(#{data_frame.join ","})" - R.assign "features", variables - R.eval "names(data) <- append(c('activities'),features)" # - begin - R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)" - rescue # fall back to weighted average - result = weighted_average(compound, params) - result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return result + compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} + prediction = r_model_prediction algorithm, algorithm_params, data_frame, variables, weights, compound_features + if prediction.nil? + prediction = weighted_average(compound, params) + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return prediction + else + return {:value => 10**prediction, :confidence => 1} # TODO confidence end - #begin - #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX - compound_features = variables.collect{|f| compound.fingerprint.include? f } - R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" - R.eval "names(fingerprint) <- features" # - R.eval "prediction <- predict(model,fingerprint)" - prediction = 10**R.eval("prediction").to_f - return {:value => prediction, :confidence => 1} # TODO confidence - #rescue - #p "Prediction failed" - #return {:value => nil, :confidence => nil} # TODO confidence - #end end end - def self.local_physchem_regression compound, params + def self.local_physchem_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -117,218 +103,39 @@ module OpenTox result = weighted_average(compound, params) result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result - else - name = Feature.find(params[:prediction_feature_id]).name - R.assign "weights", weights - data_frame = ["c(#{activities.join ","})"] - physchem.keys.each do |pid| - data_frame << "c(#{physchem[pid].join ","})" - end - R.eval "data <- data.frame(#{data_frame.join ","})" - R.assign "features", physchem.keys - R.eval "names(data) <- append(c('activities'),features)" # - begin - R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)" - rescue # fall back to weighted average - result = weighted_average(compound, params) - result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return result + else + data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } + prediction = r_model_prediction algorithm, algorithm_params, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + if prediction.nil? + prediction = weighted_average(compound, params) + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return prediction + else + return {:value => 10**prediction, :confidence => 1} # TODO confidence end - compound_features = physchem.keys.collect{|pid| compound.physchem[pid]} - R.eval "fingerprint <- rbind(c(#{compound_features.join ','}))" - R.eval "names(fingerprint) <- features" # - R.eval "prediction <- predict(model,fingerprint)" - prediction = 10**R.eval("prediction").to_f - return {:value => prediction, :confidence => 1} # TODO confidence end end - def self.weighted_average_with_relevant_fingerprints neighbors - weighted_sum = 0.0 - sim_sum = 0.0 - fingerprint_features = [] - neighbors.each do |row| - n,sim,acts = row - neighbor = Compound.find n - fingerprint_features += neighbor.fp4 - end - fingerprint_features.uniq! - p fingerprint_features -=begin - p n - acts.each do |act| - weighted_sum += sim*Math.log10(act) - sim_sum += sim - end - end -=end - confidence = sim_sum/neighbors.size.to_f - sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) - {:value => prediction,:confidence => confidence} - end - - # Local support vector regression from neighbors - # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required - # @return [Numeric] A prediction value. - def self.local_svm_regression neighbors, params={:min_train_performance => 0.1} - - confidence = 0.0 - prediction = nil - - $logger.debug "Local SVM." - props = neighbors.collect{|row| row[3] } - neighbors.shift - activities = neighbors.collect{|n| n[2]} - prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting - prediction = nil if (!prediction.nil? && prediction.infinite?) - $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')." - if prediction - confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities}) - else - confidence = nil if prediction.nil? + def self.r_model_prediction algorithm, params, training_data, training_features, training_weights, query_feature_values + R.assign "weights", training_weights + r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", training_features + R.eval "names(data) <- append(c('activities'),features)" # + begin + R.eval "model <- #{algorithm}(activities ~ .,data = data, weights = weights, #{params})" + rescue + return nil end - [prediction, confidence] - + R.eval "fingerprint <- rbind(c(#{query_feature_values.join ','}))" + R.eval "names(fingerprint) <- features" + R.eval "prediction <- predict(model,fingerprint)" + R.eval("prediction").to_f end - - # Local support vector prediction from neighbors. - # Uses propositionalized setting. - # Not to be called directly (use local_svm_regression or local_svm_classification). - # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] - # @param [Array] activities, activities for neighbors. - # @param [Float] min_train_performance, parameter to control censoring - # @return [Numeric] A prediction value. - def self.local_svm_prop(props, activities, min_train_performance) - - $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)." - n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays. - q_prop = props[0] # is an Array. - - prediction = nil - if activities.uniq.size == 1 - prediction = activities[0] - else - t = Time.now - #$logger.debug gram_matrix.to_yaml - #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests - @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests - rs = [] - ["caret", "doMC", "class"].each do |lib| - #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))" - rs << "suppressPackageStartupMessages(library('#{lib}'))" - end - #@r.eval "registerDoMC()" # switch on parallel processing - rs << "registerDoMC()" # switch on parallel processing - #@r.eval "set.seed(1)" - rs << "set.seed(1)" - $logger.debug "Loading R packages: #{Time.now-t}" - t = Time.now - p n_prop - begin - - # set data - rs << "n_prop <- c(#{n_prop.flatten.join(',')})" - rs << "n_prop <- c(#{n_prop.flatten.join(',')})" - rs << "n_prop_x_size <- c(#{n_prop.size})" - rs << "n_prop_y_size <- c(#{n_prop[0].size})" - rs << "y <- c(#{activities.join(',')})" - rs << "q_prop <- c(#{q_prop.join(',')})" - rs << "y = matrix(y)" - rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)" - rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)" - - $logger.debug "Setting R data: #{Time.now-t}" - t = Time.now - # prepare data - rs << " - weights=NULL - if (!(class(y) == 'numeric')) { - y = factor(y) - weights=unlist(as.list(prop.table(table(y)))) - weights=(weights-1)^2 - } - " - - rs << " - rem = nearZeroVar(prop_matrix) - if (length(rem) > 0) { - prop_matrix = prop_matrix[,-rem,drop=F] - q_prop = q_prop[,-rem,drop=F] - } - rem = findCorrelation(cor(prop_matrix)) - if (length(rem) > 0) { - prop_matrix = prop_matrix[,-rem,drop=F] - q_prop = q_prop[,-rem,drop=F] - } - " - - #p @r.eval("y").to_ruby - #p "weights" - #p @r.eval("weights").to_ruby - $logger.debug "Preparing R data: #{Time.now-t}" - t = Time.now - # model + support vectors - #train_success = @r.eval <<-EOR - rs << ' - model = train(prop_matrix,y, - method="svmRadial", - preProcess=c("center", "scale"), - class.weights=weights, - trControl=trainControl(method="LGOCV",number=10), - tuneLength=8 - ) - perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) - ' - File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")} - p rs.join("\n") - p `Rscript /tmp/r.r` -=begin - @r.void_eval <<-EOR - model = train(prop_matrix,y, - method="svmRadial", - #preProcess=c("center", "scale"), - #class.weights=weights, - #trControl=trainControl(method="LGOCV",number=10), - #tuneLength=8 - ) - perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) - EOR -=end - - $logger.debug "Creating R SVM model: #{Time.now-t}" - t = Time.now - if train_success - # prediction - @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice - #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice - @r.eval "if (class(y)!='numeric') p = as.character(p)" - prediction = @r.p - - # censoring - prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f ) - prediction = nil if prediction =~ /NA/ - $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'" - else - $logger.debug "Model creation failed." - prediction = nil - end - $logger.debug "R Prediction: #{Time.now-t}" - rescue Exception => e - $logger.debug "#{e.class}: #{e.message}" - $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - ensure - #puts @r.inspect - #TODO: broken pipe - #@r.quit # free R - end - end - prediction - end end - end end -- cgit v1.2.3 From 24b1524f20eccd3bfd59171f1f7151fcc272a427 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 14 Mar 2016 10:06:22 +0100 Subject: folds split on unique compounds instead of data entries --- lib/regression.rb | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index c988542..2bf8915 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -4,7 +4,7 @@ module OpenTox # TODO add LOO errors class Regression - def self.weighted_average compound, params + def self.local_weighted_average compound, params weighted_sum = 0.0 sim_sum = 0.0 confidence = 0.0 @@ -23,7 +23,8 @@ module OpenTox end # TODO explicit neighbors, also for physchem - def self.local_fingerprint_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4" + #def self.local_fingerprint_regression compound, params, method="pls", method_params="ncomp = 4" + def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 activities = [] @@ -54,25 +55,27 @@ module OpenTox end if variables.empty? - result = weighted_average(compound, params) + result = local_weighted_average(compound, params) result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} - prediction = r_model_prediction algorithm, algorithm_params, data_frame, variables, weights, compound_features + prediction = r_model_prediction method, data_frame, variables, weights, compound_features if prediction.nil? - prediction = weighted_average(compound, params) + prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - return {:value => 10**prediction, :confidence => 1} # TODO confidence + prediction[:value] = 10**prediction[:value] + prediction[:rmse] = 10**prediction[:rmse] + prediction end end end - def self.local_physchem_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4" + def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -100,39 +103,44 @@ module OpenTox end if physchem.empty? - result = weighted_average(compound, params) + result = local_weighted_average(compound, params) result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } - prediction = r_model_prediction algorithm, algorithm_params, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} if prediction.nil? - prediction = weighted_average(compound, params) + prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - return {:value => 10**prediction, :confidence => 1} # TODO confidence + prediction[:value] = 10**prediction[:value] + prediction end end end - def self.r_model_prediction algorithm, params, training_data, training_features, training_weights, query_feature_values + def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" R.eval "data <- #{r_data_frame}" R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # begin - R.eval "model <- #{algorithm}(activities ~ .,data = data, weights = weights, #{params})" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"#, #{params}" rescue return nil end - R.eval "fingerprint <- rbind(c(#{query_feature_values.join ','}))" + R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" - R.eval("prediction").to_f + { + :value => R.eval("prediction").to_f, + :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, + :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + } end end -- cgit v1.2.3 From 0c5d2e678908a2d4aea43efbedbedc2c0439be30 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 14 Mar 2016 15:25:50 +0100 Subject: descriptor tests --- lib/regression.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 2bf8915..e0b109e 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -23,7 +23,6 @@ module OpenTox end # TODO explicit neighbors, also for physchem - #def self.local_fingerprint_regression compound, params, method="pls", method_params="ncomp = 4" def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -129,7 +128,7 @@ module OpenTox R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # begin - R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"#, #{params}" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" rescue return nil end -- cgit v1.2.3 From 7c3bd90c26dfeea2db3cf74a1cefc23d8dece7c0 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 15 Mar 2016 17:40:40 +0100 Subject: validation tests pass --- lib/regression.rb | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index e0b109e..b8efd30 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -1,25 +1,23 @@ module OpenTox module Algorithm - # TODO add LOO errors class Regression def self.local_weighted_average compound, params weighted_sum = 0.0 sim_sum = 0.0 - confidence = 0.0 neighbors = params[:neighbors] neighbors.each do |row| sim = row["tanimoto"] - confidence = sim if sim > confidence # distance to nearest neighbor - row["features"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum += sim*Math.log10(act) - sim_sum += sim + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + weighted_sum += sim*Math.log10(act) + sim_sum += sim + end end end - confidence = 0 if confidence.nan? sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) - {:value => prediction,:confidence => confidence} + {:value => prediction} end # TODO explicit neighbors, also for physchem @@ -31,15 +29,18 @@ module OpenTox weights = [] fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + #p neighbors neighbors.each_with_index do |row,i| neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint - row["features"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) - weights << row["tanimoto"] - fingerprint_ids.each_with_index do |id,j| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] + fingerprint_ids.each_with_index do |id,j| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) + end end end end @@ -86,12 +87,14 @@ module OpenTox neighbors.each_with_index do |row,i| neighbor = Compound.find row["_id"] - row["features"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) - weights << row["tanimoto"] # TODO cosine ? - neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity - physchem[pid] ||= [] - physchem[pid] << v + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] # TODO cosine ? + neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + physchem[pid] ||= [] + physchem[pid] << v + end end end end -- cgit v1.2.3 From abc3526e318a2bfa24dfe033d8879e7657c2ae5c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 15 Mar 2016 18:46:34 +0100 Subject: single tests pass --- lib/regression.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index b8efd30..6b08fd8 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -62,7 +62,7 @@ module OpenTox else compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} prediction = r_model_prediction method, data_frame, variables, weights, compound_features - if prediction.nil? + if prediction.nil? or prediction[:value].nil? prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction -- cgit v1.2.3 From 2b0a7c725b23d8ef3f525b25fc7105de57ee3897 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 15 Mar 2016 18:53:12 +0100 Subject: validation test cleanup --- lib/regression.rb | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 6b08fd8..af72d7d 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -29,7 +29,6 @@ module OpenTox weights = [] fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort - #p neighbors neighbors.each_with_index do |row,i| neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint -- cgit v1.2.3 From 130524b0efa98f6e63d39c55e2f643130459ceee Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 23 Mar 2016 11:46:47 +0100 Subject: prediction interval for regression --- lib/regression.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index af72d7d..5021fb3 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -66,6 +66,7 @@ module OpenTox prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else + prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])] prediction[:value] = 10**prediction[:value] prediction[:rmse] = 10**prediction[:rmse] prediction -- cgit v1.2.3