diff options
Diffstat (limited to 'lib/regression.rb')
-rw-r--r-- | lib/regression.rb | 337 |
1 files changed, 113 insertions, 224 deletions
diff --git a/lib/regression.rb b/lib/regression.rb index 868c25f..5021fb3 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -1,262 +1,151 @@ -# TODO install R packages kernlab, caret, doMC, class, e1071 - - - # log transform activities (create new dataset) - # scale, normalize features, might not be necessary - # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is - # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression - # zero-order correlation and the semi-partial correlation - # seems to be necessary for svm - # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1 - # http://stackoverflow.com/questions/15436367/svm-scaling-input-values - # use lasso or elastic net?? - # select relevant features - # remove features with a single value - # remove correlated features - # remove features not correlated with endpoint module OpenTox module Algorithm class Regression - def self.weighted_average compound, params + def self.local_weighted_average compound, params weighted_sum = 0.0 sim_sum = 0.0 - confidence = 0.0 neighbors = params[:neighbors] - activities = [] neighbors.each do |row| - n,sim,acts = row - confidence = sim if sim > confidence # distance to nearest neighbor - # TODO add LOO errors - acts.each do |act| - weighted_sum += sim*Math.log10(act) - activities << act - sim_sum += sim + sim = row["tanimoto"] + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + weighted_sum += sim*Math.log10(act) + sim_sum += sim + end end end - #R.assign "activities", activities - #R.eval "cv = cv(activities)" - #confidence /= activities.standard_deviation#/activities.mean - #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size] - #confidence = sim_sum/neighbors.size.to_f - #confidence = neighbors.size.to_f - confidence = 0 if confidence.nan? sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) - {:value => prediction,:confidence => confidence} + {:value => prediction} end - def self.local_linear_regression compound, neighbors - p neighbors.size - return nil unless neighbors.size > 0 - features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq - p features - training_data = Array.new(neighbors.size){Array.new(features.size,0)} - neighbors.each_with_index do |n,i| - #p n.first - neighbor = Compound.find n.first - features.each_with_index do |f,j| - training_data[i][j] = 1 if neighbor.fp4.include? f + # TODO explicit neighbors, also for physchem + def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" + neighbors = params[:neighbors] + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + activities = [] + fingerprints = {} + weights = [] + fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + fingerprint = neighbor.fingerprint + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] + fingerprint_ids.each_with_index do |id,j| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) + end + end end end - p training_data - - R.assign "activities", neighbors.collect{|n| n[2].median} - R.assign "features", training_data - R.eval "model <- lm(activities ~ features)" - R.eval "summary <- summary(model)" - p R.summary - compound_features = features.collect{|f| compound.fp4.include? f ? 1 : 0} - R.assign "compound_features", compound_features - R.eval "prediction <- predict(model,compound_features)" - p R.prediction - - end - def self.weighted_average_with_relevant_fingerprints neighbors - weighted_sum = 0.0 - sim_sum = 0.0 - fingerprint_features = [] - neighbors.each do |row| - n,sim,acts = row - neighbor = Compound.find n - fingerprint_features += neighbor.fp4 - end - fingerprint_features.uniq! - p fingerprint_features -=begin - p n - acts.each do |act| - weighted_sum += sim*Math.log10(act) - sim_sum += sim + variables = [] + data_frame = [activities] + fingerprints.each do |k,v| + unless v.uniq.size == 1 + data_frame << v.collect{|m| m ? "T" : "F"} + variables << k end end -=end - confidence = sim_sum/neighbors.size.to_f - sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) - {:value => prediction,:confidence => confidence} - end - - # Local support vector regression from neighbors - # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required - # @return [Numeric] A prediction value. - def self.local_svm_regression neighbors, params={:min_train_performance => 0.1} - confidence = 0.0 - prediction = nil + if variables.empty? + result = local_weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result - $logger.debug "Local SVM." - props = neighbors.collect{|row| row[3] } - neighbors.shift - activities = neighbors.collect{|n| n[2]} - prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting - prediction = nil if (!prediction.nil? && prediction.infinite?) - $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')." - if prediction - confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities}) else - confidence = nil if prediction.nil? + compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} + prediction = r_model_prediction method, data_frame, variables, weights, compound_features + if prediction.nil? or prediction[:value].nil? + prediction = local_weighted_average(compound, params) + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return prediction + else + prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])] + prediction[:value] = 10**prediction[:value] + prediction[:rmse] = 10**prediction[:rmse] + prediction + end end - [prediction, confidence] - + end + def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4" - # Local support vector prediction from neighbors. - # Uses propositionalized setting. - # Not to be called directly (use local_svm_regression or local_svm_classification). - # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] - # @param [Array] activities, activities for neighbors. - # @param [Float] min_train_performance, parameter to control censoring - # @return [Numeric] A prediction value. - def self.local_svm_prop(props, activities, min_train_performance) - - $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)." - n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays. - q_prop = props[0] # is an Array. + neighbors = params[:neighbors] + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 - prediction = nil - if activities.uniq.size == 1 - prediction = activities[0] - else - t = Time.now - #$logger.debug gram_matrix.to_yaml - #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests - @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests - rs = [] - ["caret", "doMC", "class"].each do |lib| - #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))" - rs << "suppressPackageStartupMessages(library('#{lib}'))" + activities = [] + weights = [] + physchem = {} + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] # TODO cosine ? + neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + physchem[pid] ||= [] + physchem[pid] << v + end + end end - #@r.eval "registerDoMC()" # switch on parallel processing - rs << "registerDoMC()" # switch on parallel processing - #@r.eval "set.seed(1)" - rs << "set.seed(1)" - $logger.debug "Loading R packages: #{Time.now-t}" - t = Time.now - p n_prop - begin - - # set data - rs << "n_prop <- c(#{n_prop.flatten.join(',')})" - rs << "n_prop <- c(#{n_prop.flatten.join(',')})" - rs << "n_prop_x_size <- c(#{n_prop.size})" - rs << "n_prop_y_size <- c(#{n_prop[0].size})" - rs << "y <- c(#{activities.join(',')})" - rs << "q_prop <- c(#{q_prop.join(',')})" - rs << "y = matrix(y)" - rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)" - rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)" - - $logger.debug "Setting R data: #{Time.now-t}" - t = Time.now - # prepare data - rs << " - weights=NULL - if (!(class(y) == 'numeric')) { - y = factor(y) - weights=unlist(as.list(prop.table(table(y)))) - weights=(weights-1)^2 - } - " - - rs << " - rem = nearZeroVar(prop_matrix) - if (length(rem) > 0) { - prop_matrix = prop_matrix[,-rem,drop=F] - q_prop = q_prop[,-rem,drop=F] - } - rem = findCorrelation(cor(prop_matrix)) - if (length(rem) > 0) { - prop_matrix = prop_matrix[,-rem,drop=F] - q_prop = q_prop[,-rem,drop=F] - } - " + end - #p @r.eval("y").to_ruby - #p "weights" - #p @r.eval("weights").to_ruby - $logger.debug "Preparing R data: #{Time.now-t}" - t = Time.now - # model + support vectors - #train_success = @r.eval <<-EOR - rs << ' - model = train(prop_matrix,y, - method="svmRadial", - preProcess=c("center", "scale"), - class.weights=weights, - trControl=trainControl(method="LGOCV",number=10), - tuneLength=8 - ) - perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) - ' - File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")} - p rs.join("\n") - p `Rscript /tmp/r.r` -=begin - @r.void_eval <<-EOR - model = train(prop_matrix,y, - method="svmRadial", - #preProcess=c("center", "scale"), - #class.weights=weights, - #trControl=trainControl(method="LGOCV",number=10), - #tuneLength=8 - ) - perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) - EOR -=end + # remove properties with a single value + physchem.each do |pid,v| + physchem.delete(pid) if v.uniq.size <= 1 + end - $logger.debug "Creating R SVM model: #{Time.now-t}" - t = Time.now - if train_success - # prediction - @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice - #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice - @r.eval "if (class(y)!='numeric') p = as.character(p)" - prediction = @r.p + if physchem.empty? + result = local_weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result - # censoring - prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f ) - prediction = nil if prediction =~ /NA/ - $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'" - else - $logger.debug "Model creation failed." - prediction = nil - end - $logger.debug "R Prediction: #{Time.now-t}" - rescue Exception => e - $logger.debug "#{e.class}: #{e.message}" - $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - ensure - #puts @r.inspect - #TODO: broken pipe - #@r.quit # free R + else + data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } + prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + if prediction.nil? + prediction = local_weighted_average(compound, params) + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return prediction + else + prediction[:value] = 10**prediction[:value] + prediction end end - prediction + end - end + def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values + R.assign "weights", training_weights + r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", training_features + R.eval "names(data) <- append(c('activities'),features)" # + begin + R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" + rescue + return nil + end + R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + R.eval "names(fingerprint) <- features" + R.eval "prediction <- predict(model,fingerprint)" + { + :value => R.eval("prediction").to_f, + :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, + :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + } + end + + end end end |