summaryrefslogtreecommitdiff
path: root/lib/regression.rb
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2016-02-29 14:11:30 +0100
committerChristoph Helma <helma@in-silico.ch>2016-02-29 14:11:30 +0100
commit003332ad95dd4c63d0b7c00d22c73f460b163139 (patch)
tree5d0e442e3ccbebd9623a089812989682afb42631 /lib/regression.rb
parentc4b56b22fd6e65633deb7e52bd99865e3bee8f00 (diff)
modular regression algorithms
Diffstat (limited to 'lib/regression.rb')
-rw-r--r--lib/regression.rb269
1 files changed, 38 insertions, 231 deletions
diff --git a/lib/regression.rb b/lib/regression.rb
index 0694a68..c988542 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -22,7 +22,8 @@ module OpenTox
{:value => prediction,:confidence => confidence}
end
- def self.local_pls_regression compound, params
+ # TODO explicit neighbors, also for physchem
+ def self.local_fingerprint_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4"
neighbors = params[:neighbors]
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
activities = []
@@ -43,50 +44,35 @@ module OpenTox
end
end
- name = Feature.find(params[:prediction_feature_id]).name
- R.assign "activities", activities
- R.assign "weights", weights
variables = []
- data_frame = ["c(#{activities.join ","})"]
+ data_frame = [activities]
fingerprints.each do |k,v|
unless v.uniq.size == 1
- data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))"
+ data_frame << v.collect{|m| m ? "T" : "F"}
variables << k
end
end
+
if variables.empty?
result = weighted_average(compound, params)
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
return result
- return {:value => nil, :confidence => nil} # TODO confidence
+
else
- R.eval "data <- data.frame(#{data_frame.join ","})"
- R.assign "features", variables
- R.eval "names(data) <- append(c('activities'),features)" #
- begin
- R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)"
- rescue # fall back to weighted average
- result = weighted_average(compound, params)
- result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
- return result
+ compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
+ prediction = r_model_prediction algorithm, algorithm_params, data_frame, variables, weights, compound_features
+ if prediction.nil?
+ prediction = weighted_average(compound, params)
+ prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
+ return prediction
+ else
+ return {:value => 10**prediction, :confidence => 1} # TODO confidence
end
- #begin
- #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX
- compound_features = variables.collect{|f| compound.fingerprint.include? f }
- R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))"
- R.eval "names(fingerprint) <- features" #
- R.eval "prediction <- predict(model,fingerprint)"
- prediction = 10**R.eval("prediction").to_f
- return {:value => prediction, :confidence => 1} # TODO confidence
- #rescue
- #p "Prediction failed"
- #return {:value => nil, :confidence => nil} # TODO confidence
- #end
end
end
- def self.local_physchem_regression compound, params
+ def self.local_physchem_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4"
neighbors = params[:neighbors]
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
@@ -117,218 +103,39 @@ module OpenTox
result = weighted_average(compound, params)
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
return result
- else
- name = Feature.find(params[:prediction_feature_id]).name
- R.assign "weights", weights
- data_frame = ["c(#{activities.join ","})"]
- physchem.keys.each do |pid|
- data_frame << "c(#{physchem[pid].join ","})"
- end
- R.eval "data <- data.frame(#{data_frame.join ","})"
- R.assign "features", physchem.keys
- R.eval "names(data) <- append(c('activities'),features)" #
- begin
- R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)"
- rescue # fall back to weighted average
- result = weighted_average(compound, params)
- result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
- return result
+ else
+ data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
+ prediction = r_model_prediction algorithm, algorithm_params, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
+ if prediction.nil?
+ prediction = weighted_average(compound, params)
+ prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
+ return prediction
+ else
+ return {:value => 10**prediction, :confidence => 1} # TODO confidence
end
- compound_features = physchem.keys.collect{|pid| compound.physchem[pid]}
- R.eval "fingerprint <- rbind(c(#{compound_features.join ','}))"
- R.eval "names(fingerprint) <- features" #
- R.eval "prediction <- predict(model,fingerprint)"
- prediction = 10**R.eval("prediction").to_f
- return {:value => prediction, :confidence => 1} # TODO confidence
end
end
- def self.weighted_average_with_relevant_fingerprints neighbors
- weighted_sum = 0.0
- sim_sum = 0.0
- fingerprint_features = []
- neighbors.each do |row|
- n,sim,acts = row
- neighbor = Compound.find n
- fingerprint_features += neighbor.fp4
- end
- fingerprint_features.uniq!
- p fingerprint_features
-=begin
- p n
- acts.each do |act|
- weighted_sum += sim*Math.log10(act)
- sim_sum += sim
- end
- end
-=end
- confidence = sim_sum/neighbors.size.to_f
- sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
- {:value => prediction,:confidence => confidence}
- end
-
- # Local support vector regression from neighbors
- # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
- # @return [Numeric] A prediction value.
- def self.local_svm_regression neighbors, params={:min_train_performance => 0.1}
-
- confidence = 0.0
- prediction = nil
-
- $logger.debug "Local SVM."
- props = neighbors.collect{|row| row[3] }
- neighbors.shift
- activities = neighbors.collect{|n| n[2]}
- prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
- prediction = nil if (!prediction.nil? && prediction.infinite?)
- $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."
- if prediction
- confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities})
- else
- confidence = nil if prediction.nil?
+ def self.r_model_prediction algorithm, params, training_data, training_features, training_weights, query_feature_values
+ R.assign "weights", training_weights
+ r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+ R.eval "data <- #{r_data_frame}"
+ R.assign "features", training_features
+ R.eval "names(data) <- append(c('activities'),features)" #
+ begin
+ R.eval "model <- #{algorithm}(activities ~ .,data = data, weights = weights, #{params})"
+ rescue
+ return nil
end
- [prediction, confidence]
-
+ R.eval "fingerprint <- rbind(c(#{query_feature_values.join ','}))"
+ R.eval "names(fingerprint) <- features"
+ R.eval "prediction <- predict(model,fingerprint)"
+ R.eval("prediction").to_f
end
-
- # Local support vector prediction from neighbors.
- # Uses propositionalized setting.
- # Not to be called directly (use local_svm_regression or local_svm_classification).
- # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
- # @param [Array] activities, activities for neighbors.
- # @param [Float] min_train_performance, parameter to control censoring
- # @return [Numeric] A prediction value.
- def self.local_svm_prop(props, activities, min_train_performance)
-
- $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)."
- n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays.
- q_prop = props[0] # is an Array.
-
- prediction = nil
- if activities.uniq.size == 1
- prediction = activities[0]
- else
- t = Time.now
- #$logger.debug gram_matrix.to_yaml
- #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
- @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests
- rs = []
- ["caret", "doMC", "class"].each do |lib|
- #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))"
- rs << "suppressPackageStartupMessages(library('#{lib}'))"
- end
- #@r.eval "registerDoMC()" # switch on parallel processing
- rs << "registerDoMC()" # switch on parallel processing
- #@r.eval "set.seed(1)"
- rs << "set.seed(1)"
- $logger.debug "Loading R packages: #{Time.now-t}"
- t = Time.now
- p n_prop
- begin
-
- # set data
- rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
- rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
- rs << "n_prop_x_size <- c(#{n_prop.size})"
- rs << "n_prop_y_size <- c(#{n_prop[0].size})"
- rs << "y <- c(#{activities.join(',')})"
- rs << "q_prop <- c(#{q_prop.join(',')})"
- rs << "y = matrix(y)"
- rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
- rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
-
- $logger.debug "Setting R data: #{Time.now-t}"
- t = Time.now
- # prepare data
- rs << "
- weights=NULL
- if (!(class(y) == 'numeric')) {
- y = factor(y)
- weights=unlist(as.list(prop.table(table(y))))
- weights=(weights-1)^2
- }
- "
-
- rs << "
- rem = nearZeroVar(prop_matrix)
- if (length(rem) > 0) {
- prop_matrix = prop_matrix[,-rem,drop=F]
- q_prop = q_prop[,-rem,drop=F]
- }
- rem = findCorrelation(cor(prop_matrix))
- if (length(rem) > 0) {
- prop_matrix = prop_matrix[,-rem,drop=F]
- q_prop = q_prop[,-rem,drop=F]
- }
- "
-
- #p @r.eval("y").to_ruby
- #p "weights"
- #p @r.eval("weights").to_ruby
- $logger.debug "Preparing R data: #{Time.now-t}"
- t = Time.now
- # model + support vectors
- #train_success = @r.eval <<-EOR
- rs << '
- model = train(prop_matrix,y,
- method="svmRadial",
- preProcess=c("center", "scale"),
- class.weights=weights,
- trControl=trainControl(method="LGOCV",number=10),
- tuneLength=8
- )
- perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
- '
- File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")}
- p rs.join("\n")
- p `Rscript /tmp/r.r`
-=begin
- @r.void_eval <<-EOR
- model = train(prop_matrix,y,
- method="svmRadial",
- #preProcess=c("center", "scale"),
- #class.weights=weights,
- #trControl=trainControl(method="LGOCV",number=10),
- #tuneLength=8
- )
- perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
- EOR
-=end
-
- $logger.debug "Creating R SVM model: #{Time.now-t}"
- t = Time.now
- if train_success
- # prediction
- @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
- #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
- @r.eval "if (class(y)!='numeric') p = as.character(p)"
- prediction = @r.p
-
- # censoring
- prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f )
- prediction = nil if prediction =~ /NA/
- $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'"
- else
- $logger.debug "Model creation failed."
- prediction = nil
- end
- $logger.debug "R Prediction: #{Time.now-t}"
- rescue Exception => e
- $logger.debug "#{e.class}: #{e.message}"
- $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
- ensure
- #puts @r.inspect
- #TODO: broken pipe
- #@r.quit # free R
- end
- end
- prediction
- end
end
-
end
end