From 753fcc204d93d86c76860bee6e2f7d0468c3c940 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 14 Apr 2016 19:43:24 +0200 Subject: features/toxicities fixed --- lib/regression.rb | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 5021fb3..cb17f25 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -9,8 +9,8 @@ module OpenTox neighbors = params[:neighbors] neighbors.each do |row| sim = row["tanimoto"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| + if row["toxicities"][params[:prediction_feature_id].to_s] + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) sim_sum += sim end @@ -32,8 +32,8 @@ module OpenTox neighbors.each_with_index do |row,i| neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| + if row["toxicities"][params[:prediction_feature_id].to_s] + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| activities << Math.log10(act) weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| @@ -79,21 +79,24 @@ module OpenTox neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] physchem = {} - neighbors.each_with_index do |row,i| - neighbor = Compound.find row["_id"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) - weights << row["tanimoto"] # TODO cosine ? - neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + neighbors.each_with_index do |n,i| + if n["toxicities"][params[:prediction_feature_id].to_s] + n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + # TODO fix!!!! + activities << -Math.log10(act) + #if act.numeric? + #activities << act + n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + neighbor = Substance.find(n["_id"]) + neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity physchem[pid] ||= [] - physchem[pid] << v + physchem[pid] += v end end end @@ -110,8 +113,8 @@ module OpenTox return result else - data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } - prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }} + prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]} if prediction.nil? prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." @@ -127,6 +130,8 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + #p r_data_frame + File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"} R.eval "data <- #{r_data_frame}" R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # -- cgit v1.2.3 From cfc64a2966ab38698e499f0b44f41208ee77a07f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 26 Apr 2016 17:38:15 +0200 Subject: first nanomaterial prediction --- lib/regression.rb | 99 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 30 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index cb17f25..5610a77 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -75,46 +75,62 @@ module OpenTox end - def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4" + def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" + + neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities - neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] - physchem = {} + pc_ids = neighbors.collect{|n| n.physchem_descriptors.keys}.flatten.uniq + data_frame = [] + data_frame[0] = [] neighbors.each_with_index do |n,i| - if n["toxicities"][params[:prediction_feature_id].to_s] - n["toxicities"][params[:prediction_feature_id].to_s].each do |act| - # TODO fix!!!! - activities << -Math.log10(act) - #if act.numeric? - #activities << act - n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? - neighbor = Substance.find(n["_id"]) - neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity - physchem[pid] ||= [] - physchem[pid] += v - end + neighbor = Substance.find(n["_id"]) + n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + data_frame[0][i] = act + n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + neighbor.physchem_descriptors.each do |pid,values| + values.uniq! + warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1 + j = pc_ids.index(pid)+1 + data_frame[j] ||= [] + data_frame[j][i] = values.for_R end end + (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + data_frame[j] ||= [] + data_frame[j][i] ||= "NA" + end end - - # remove properties with a single value - physchem.each do |pid,v| - physchem.delete(pid) if v.uniq.size <= 1 + remove_idx = [] + data_frame.each_with_index do |r,i| + remove_idx << i if r.uniq.size == 1 # remove properties with a single value + end + remove_idx.reverse.each do |i| + data_frame.delete_at i + pc_ids.delete_at i end - if physchem.empty? + if pc_ids.empty? result = local_weighted_average(compound, params) result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result - else - data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }} - prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]} + query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R} + remove_idx = [] + query_descriptors.each_with_index do |v,i| + remove_idx << i if v == "NA" + end + remove_idx.reverse.each do |i| + data_frame.delete_at i + pc_ids.delete_at i + query_descriptors.delete_at i + end + prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." @@ -130,16 +146,39 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" - #p r_data_frame - File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"} +rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) + File.open("tmp.R","w+"){|f| + f.puts "suppressPackageStartupMessages({ + library(iterators,lib=\"#{rlib}\") + library(foreach,lib=\"#{rlib}\") + library(ggplot2,lib=\"#{rlib}\") + library(grid,lib=\"#{rlib}\") + library(gridExtra,lib=\"#{rlib}\") + library(pls,lib=\"#{rlib}\") + library(caret,lib=\"#{rlib}\") + library(doMC,lib=\"#{rlib}\") + registerDoMC(#{NR_CORES}) +})" + + f.puts "data <- #{r_data_frame}\n" + f.puts "weights <- c(#{training_weights.join(', ')})" + f.puts "features <- c(#{training_features.join(', ')})" + f.puts "names(data) <- append(c('activities'),features)" # + f.puts "model <- train(activities ~ ., data = data, method = '#{method}')" + f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + f.puts "names(fingerprint) <- features" + f.puts "prediction <- predict(model,fingerprint)" + } + R.eval "data <- #{r_data_frame}" R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # - begin - R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" - rescue - return nil - end + #begin + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" + #rescue + #return nil + #end + p query_feature_values R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" -- cgit v1.2.3 From 32d767ee7cfcc19337892551906950621f348174 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Apr 2016 08:11:12 +0200 Subject: nanoparticle crossvalidation technically working --- lib/regression.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 5610a77..3a59c14 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -9,6 +9,7 @@ module OpenTox neighbors = params[:neighbors] neighbors.each do |row| sim = row["tanimoto"] + sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) @@ -120,7 +121,7 @@ module OpenTox result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else - query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R} + query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact remove_idx = [] query_descriptors.each_with_index do |v,i| remove_idx << i if v == "NA" @@ -172,13 +173,9 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) R.eval "data <- #{r_data_frame}" R.assign "features", training_features - R.eval "names(data) <- append(c('activities'),features)" # - #begin + begin + R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" - #rescue - #return nil - #end - p query_feature_values R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" @@ -187,6 +184,9 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, } + rescue + return nil + end end end -- cgit v1.2.3 From 79238bddb59607aa9f759caa9e3c8db176709703 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Apr 2016 12:19:48 +0200 Subject: compound validations fixed --- lib/regression.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 3a59c14..694a2dc 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -85,7 +85,7 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| n.physchem_descriptors.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq data_frame = [] data_frame[0] = [] -- cgit v1.2.3 From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 May 2016 19:24:42 +0200 Subject: first reasonable results for nanoparticle crossvalidation --- lib/regression.rb | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 694a2dc..d2c4e91 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -12,16 +12,15 @@ module OpenTox sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum += sim*Math.log10(act) + weighted_sum += sim*act sim_sum += sim end end end - sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) + sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end - # TODO explicit neighbors, also for physchem def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -35,7 +34,7 @@ module OpenTox fingerprint = neighbor.fingerprint if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) + activities << act weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| fingerprints[id] ||= [] @@ -67,9 +66,9 @@ module OpenTox prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])] - prediction[:value] = 10**prediction[:value] - prediction[:rmse] = 10**prediction[:rmse] + prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] + prediction[:value] = prediction[:value] + prediction[:rmse] = prediction[:rmse] prediction end end @@ -96,7 +95,7 @@ module OpenTox n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| values.uniq! - warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1 + warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 data_frame[j] ||= [] data_frame[j][i] = values.for_R @@ -121,7 +120,9 @@ module OpenTox result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else - query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact + query_descriptors = pc_ids.collect do |i| + compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + end remove_idx = [] query_descriptors.each_with_index do |v,i| remove_idx << i if v == "NA" @@ -137,7 +138,6 @@ module OpenTox prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - prediction[:value] = 10**prediction[:value] prediction end end @@ -148,6 +148,7 @@ module OpenTox R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) +=begin File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ library(iterators,lib=\"#{rlib}\") @@ -170,20 +171,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } +=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features begin R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" - R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - R.eval "names(fingerprint) <- features" - R.eval "prediction <- predict(model,fingerprint)" - { - :value => R.eval("prediction").to_f, - :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, - :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, - } + R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + R.eval "names(fingerprint) <- features" + R.eval "prediction <- predict(model,fingerprint)" + { + :value => R.eval("prediction").to_f, + :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, + :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + } rescue return nil end -- cgit v1.2.3 From ab7b37541b4f8a762be737009631d3eefd898b4a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 5 May 2016 16:14:02 +0200 Subject: ambit mirror, import from mirrored json, proteomics import --- lib/regression.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index d2c4e91..fe45f99 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -84,7 +84,7 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq data_frame = [] data_frame[0] = [] @@ -93,7 +93,7 @@ module OpenTox n["toxicities"][params[:prediction_feature_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? - neighbor.physchem_descriptors.each do |pid,values| + neighbor.physchem.each do |pid,values| values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 @@ -121,7 +121,7 @@ module OpenTox return result else query_descriptors = pc_ids.collect do |i| - compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + compound.physchem[i] ? compound.physchem_descriptors[i].for_R : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| -- cgit v1.2.3 From 51f57e2858b60bed74ebcc97189b2188c900c283 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 6 May 2016 12:49:28 +0200 Subject: dataset tests cleanup --- lib/regression.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index fe45f99..d2c4e91 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -84,7 +84,7 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq data_frame = [] data_frame[0] = [] @@ -93,7 +93,7 @@ module OpenTox n["toxicities"][params[:prediction_feature_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? - neighbor.physchem.each do |pid,values| + neighbor.physchem_descriptors.each do |pid,values| values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 @@ -121,7 +121,7 @@ module OpenTox return result else query_descriptors = pc_ids.collect do |i| - compound.physchem[i] ? compound.physchem_descriptors[i].for_R : "NA" + compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| -- cgit v1.2.3 From 06fc914653face2c58fd4e6c47161cb03e217582 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 8 May 2016 12:22:58 +0200 Subject: default validations fixed --- lib/regression.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index d2c4e91..13e1380 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -11,7 +11,7 @@ module OpenTox sim = row["tanimoto"] sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| weighted_sum += sim*act sim_sum += sim end @@ -33,7 +33,7 @@ module OpenTox neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| activities << act weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| @@ -77,10 +77,10 @@ module OpenTox def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" - neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities + neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] @@ -90,7 +90,7 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| -- cgit v1.2.3 From 7794086d367fb256c3673d7578b23ec2fb83e6ed Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 14:05:29 +0200 Subject: physchem crossvalidation fixed --- lib/regression.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 13e1380..b8a7e5f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -80,7 +80,7 @@ module OpenTox neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].median, :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] @@ -94,6 +94,7 @@ module OpenTox data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| + values = [values] if values.is_a? Float values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 -- cgit v1.2.3 From 611bac891177f8d9185d45486dd574b6ef4d1912 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 15:11:46 +0200 Subject: nanoparticle models fixed --- lib/regression.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index b8a7e5f..691f903 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -10,7 +10,7 @@ module OpenTox neighbors.each do |row| sim = row["tanimoto"] sim ||= 1 # TODO: sim f nanoparticles - if row["toxicities"][params[:prediction_feature_id].to_s] + if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| weighted_sum += sim*act sim_sum += sim -- cgit v1.2.3 From c1be8fe66f640d44dbbc9bfe5212733994bfb9c5 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 15:44:29 +0200 Subject: physchem crossvalidation fixed, test_compound_descriptor_parameters assertions fixed --- lib/regression.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 691f903..2eaae73 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -94,7 +94,7 @@ module OpenTox data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| - values = [values] if values.is_a? Float + values = [values] unless values.is_a? Array values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 -- cgit v1.2.3 From b8bb12c8a163c238d7d4387c1914e2100bb660df Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 12 May 2016 15:23:01 +0200 Subject: enm study import fixed --- lib/regression.rb | 102 +++++++++++++++++++++++++----------------------------- 1 file changed, 47 insertions(+), 55 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 2eaae73..9d305a6 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,49 +3,43 @@ module OpenTox class Regression - def self.local_weighted_average compound, params + def self.local_weighted_average substance, neighbors weighted_sum = 0.0 sim_sum = 0.0 - neighbors = params[:neighbors] - neighbors.each do |row| - sim = row["tanimoto"] - sim ||= 1 # TODO: sim f nanoparticles - if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| - weighted_sum += sim*act - sim_sum += sim - end - end + neighbors.each do |neighbor| + sim = neighbor["similarity"] + activities = neighbor["toxicities"] + activities.each do |act| + weighted_sum += sim*act + sim_sum += sim + end if activities end sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end - def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" - neighbors = params[:neighbors] - return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - activities = [] + def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05" + values = [] fingerprints = {} weights = [] - fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort - - neighbors.each_with_index do |row,i| - neighbor = Compound.find row["_id"] - fingerprint = neighbor.fingerprint - if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| - activities << act - weights << row["tanimoto"] - fingerprint_ids.each_with_index do |id,j| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) - end + fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each do |n| + fingerprint = Substance.find(n["_id"]).fingerprint + activities = n["toxicities"] + activities.each do |act| + values << act + weights << n["similarity"] + fingerprint_ids.each do |id| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) end - end + end if activities end variables = [] - data_frame = [activities] + data_frame = [values] + fingerprints.each do |k,v| unless v.uniq.size == 1 data_frame << v.collect{|m| m ? "T" : "F"} @@ -54,17 +48,16 @@ module OpenTox end if variables.empty? - result = local_weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result - + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction else - compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} - prediction = r_model_prediction method, data_frame, variables, weights, compound_features + substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} + prediction = r_model_prediction method, data_frame, variables, weights, substance_features if prediction.nil? or prediction[:value].nil? - prediction = local_weighted_average(compound, params) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return prediction + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." + prediction else prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] prediction[:value] = prediction[:value] @@ -75,13 +68,10 @@ module OpenTox end - def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" - - neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities - - return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].median, :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4" + def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4" + #dataset = Dataset.find dataset_id activities = [] weights = [] pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq @@ -90,9 +80,11 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| + activities = neighbor["toxicities"] + activities.each do |act| data_frame[0][i] = act - n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + # TODO: update with cosine similarity for physchem + weights << n["similarity"] neighbor.physchem_descriptors.each do |pid,values| values = [values] unless values.is_a? Array values.uniq! @@ -101,7 +93,7 @@ module OpenTox data_frame[j] ||= [] data_frame[j][i] = values.for_R end - end + end if activities (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" @@ -117,12 +109,12 @@ module OpenTox end if pc_ids.empty? - result = local_weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction else query_descriptors = pc_ids.collect do |i| - compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + substance.physchem_descriptors[i] ? substance.physchem_descriptors[i].for_R : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| @@ -135,9 +127,9 @@ module OpenTox end prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? - prediction = local_weighted_average(compound, params) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return prediction + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." + prediction else prediction end -- cgit v1.2.3 From f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 27 May 2016 19:16:16 +0200 Subject: first correlation of nanoparticle predictions --- lib/regression.rb | 2 -- 1 file changed, 2 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 9d305a6..6487557 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -71,7 +71,6 @@ module OpenTox #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4" def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4" - #dataset = Dataset.find dataset_id activities = [] weights = [] pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq @@ -83,7 +82,6 @@ module OpenTox activities = neighbor["toxicities"] activities.each do |act| data_frame[0][i] = act - # TODO: update with cosine similarity for physchem weights << n["similarity"] neighbor.physchem_descriptors.each do |pid,values| values = [values] unless values.is_a? Array -- cgit v1.2.3 From b515a0cfedb887a2af753db6e4a08ae1af430cad Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 31 May 2016 18:08:08 +0200 Subject: cleanup of validation modules/classes --- lib/regression.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 6487557..cffcbbf 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -8,7 +8,7 @@ module OpenTox sim_sum = 0.0 neighbors.each do |neighbor| sim = neighbor["similarity"] - activities = neighbor["toxicities"] + activities = neighbor["measurements"] activities.each do |act| weighted_sum += sim*act sim_sum += sim @@ -26,7 +26,7 @@ module OpenTox neighbors.each do |n| fingerprint = Substance.find(n["_id"]).fingerprint - activities = n["toxicities"] + activities = n["measurements"] activities.each do |act| values << act weights << n["similarity"] @@ -79,7 +79,7 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - activities = neighbor["toxicities"] + activities = neighbor["measurements"] activities.each do |act| data_frame[0][i] = act weights << n["similarity"] -- cgit v1.2.3 From 85f2308c101b4778508c2d767e08af4cfd671b7b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 12:22:39 +0200 Subject: local pls regression for nanoparticles --- lib/regression.rb | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index cffcbbf..5028c78 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -73,23 +73,19 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort data_frame = [] data_frame[0] = [] neighbors.each_with_index do |n,i| - neighbor = Substance.find(n["_id"]) - activities = neighbor["measurements"] + activities = n["measurements"] activities.each do |act| data_frame[0][i] = act weights << n["similarity"] - neighbor.physchem_descriptors.each do |pid,values| - values = [values] unless values.is_a? Array - values.uniq! - warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 - j = pc_ids.index(pid)+1 + n["common_descriptors"].each do |d| + j = pc_ids.index(d[:id])+1 data_frame[j] ||= [] - data_frame[j][i] = values.for_R + data_frame[j][i] = d[:scaled_value] end end if activities (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA @@ -97,10 +93,12 @@ module OpenTox data_frame[j][i] ||= "NA" end end + remove_idx = [] data_frame.each_with_index do |r,i| remove_idx << i if r.uniq.size == 1 # remove properties with a single value end + remove_idx.reverse.each do |i| data_frame.delete_at i pc_ids.delete_at i @@ -112,7 +110,7 @@ module OpenTox prediction else query_descriptors = pc_ids.collect do |i| - substance.physchem_descriptors[i] ? substance.physchem_descriptors[i].for_R : "NA" + substance.scaled_values[i] ? substance.scaled_values[i] : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| @@ -127,10 +125,9 @@ module OpenTox if prediction.nil? prediction = local_weighted_average substance, neighbors prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." - prediction - else - prediction end + p prediction + prediction end end @@ -172,10 +169,15 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" + value = R.eval("prediction").to_f + rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f + r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f + prediction_interval = value-1.96*rmse, value+1.96*rmse { - :value => R.eval("prediction").to_f, - :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, - :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + :value => value, + :rmse => rmse, + :r_squared => r_squared, + :prediction_interval => prediction_interval } rescue return nil -- cgit v1.2.3 From eec5bddbd35c9ecee8021128508d8718bccb4fe3 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 17:54:48 +0200 Subject: local pls regression for nanoparticle proteomics --- lib/regression.rb | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 5028c78..b9067c6 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -88,35 +88,42 @@ module OpenTox data_frame[j][i] = d[:scaled_value] end end if activities - (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + (0..pc_ids.size).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" end end - remove_idx = [] - data_frame.each_with_index do |r,i| - remove_idx << i if r.uniq.size == 1 # remove properties with a single value - end + #remove_idx = [] + #data_frame.each_with_index do |r,i| + #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment + #end - remove_idx.reverse.each do |i| - data_frame.delete_at i - pc_ids.delete_at i - end + #p data_frame.size + #p pc_ids.size + #data_frame.delete_if.with_index { |_, index| remove_idx.include? index } + #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 } + #remove_idx.sort.reverse.each do |i| + #p i + #data_frame.delete_at i + #pc_ids.delete_at i + #end + #p data_frame.size + #p pc_ids.size if pc_ids.empty? prediction = local_weighted_average substance, neighbors prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." prediction else - query_descriptors = pc_ids.collect do |i| - substance.scaled_values[i] ? substance.scaled_values[i] : "NA" - end + query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } remove_idx = [] query_descriptors.each_with_index do |v,i| - remove_idx << i if v == "NA" + #remove_idx << i if v == "NA" + remove_idx << i unless v end - remove_idx.reverse.each do |i| + remove_idx.sort.reverse.each do |i| data_frame.delete_at i pc_ids.delete_at i query_descriptors.delete_at i @@ -135,8 +142,9 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" -rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) =begin +=end +rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ library(iterators,lib=\"#{rlib}\") @@ -159,10 +167,11 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } -=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features + p training_features.size + p R.eval("names(data)").to_ruby.size begin R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" -- cgit v1.2.3 From 290c7f86950c4051d018b8019ff4e72ec406c58c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 3 Jun 2016 19:15:36 +0200 Subject: random forest regression --- lib/regression.rb | 63 +++++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 35 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index b9067c6..c4c83d2 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,7 @@ module OpenTox class Regression - def self.local_weighted_average substance, neighbors + def self.local_weighted_average substance:, neighbors: weighted_sum = 0.0 sim_sum = 0.0 neighbors.each do |neighbor| @@ -18,7 +18,7 @@ module OpenTox {:value => prediction} end - def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05" + def self.local_fingerprint_regression substance:, neighbors:, method: pls#, method_params="sigma=0.05" values = [] fingerprints = {} weights = [] @@ -68,8 +68,7 @@ module OpenTox end - #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4" - def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4" + def self.local_physchem_regression substance:, neighbors:, method: pls activities = [] weights = [] @@ -88,46 +87,39 @@ module OpenTox data_frame[j][i] = d[:scaled_value] end end if activities - #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA (0..pc_ids.size).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" end end - #remove_idx = [] - #data_frame.each_with_index do |r,i| - #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment - #end - - #p data_frame.size - #p pc_ids.size - #data_frame.delete_if.with_index { |_, index| remove_idx.include? index } - #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 } - #remove_idx.sort.reverse.each do |i| - #p i - #data_frame.delete_at i - #pc_ids.delete_at i - #end - #p data_frame.size - #p pc_ids.size + data_frame = data_frame.each_with_index.collect do |r,i| + if r.uniq.size == 1 # remove properties with a single value + r = nil + pc_ids[i-1] = nil # data_frame frame has additional activity entry + end + r + end + data_frame.compact! + pc_ids.compact! if pc_ids.empty? prediction = local_weighted_average substance, neighbors - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." prediction else query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } - remove_idx = [] - query_descriptors.each_with_index do |v,i| - #remove_idx << i if v == "NA" - remove_idx << i unless v - end - remove_idx.sort.reverse.each do |i| - data_frame.delete_at i - pc_ids.delete_at i - query_descriptors.delete_at i + query_descriptors = query_descriptors.each_with_index.collect do |v,i| + unless v + v = nil + data_frame[i] = nil + pc_ids[i] = nil + end + v end + query_descriptors.compact! + data_frame.compact! + pc_ids.compact! prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? prediction = local_weighted_average substance, neighbors @@ -143,7 +135,6 @@ module OpenTox R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" =begin -=end rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ @@ -162,19 +153,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "weights <- c(#{training_weights.join(', ')})" f.puts "features <- c(#{training_features.join(', ')})" f.puts "names(data) <- append(c('activities'),features)" # + f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)" + f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)" + f.puts "model <- train(activities ~ ., data = data, method = '#{method}')" f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } +=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features - p training_features.size - p R.eval("names(data)").to_ruby.size begin R.eval "names(data) <- append(c('activities'),features)" # - R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" -- cgit v1.2.3 From f7e87b45f15083e5fcdea64821f06ed93ece4c4e Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 7 Jun 2016 18:07:28 +0200 Subject: (repeated)crossvalidation plots --- lib/regression.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index c4c83d2..51317ac 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -122,7 +122,7 @@ module OpenTox pc_ids.compact! prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? - prediction = local_weighted_average substance, neighbors + prediction = local_weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." end p prediction -- cgit v1.2.3 From 0f31c884d1bcfa448a1bf43a41d8fd6cf88bfc52 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 8 Jun 2016 18:26:07 +0200 Subject: compound tests fixed --- lib/regression.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 51317ac..d034d0b 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -18,7 +18,7 @@ module OpenTox {:value => prediction} end - def self.local_fingerprint_regression substance:, neighbors:, method: pls#, method_params="sigma=0.05" + def self.local_fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" values = [] fingerprints = {} weights = [] @@ -55,7 +55,7 @@ module OpenTox substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} prediction = r_model_prediction method, data_frame, variables, weights, substance_features if prediction.nil? or prediction[:value].nil? - prediction = local_weighted_average substance, neighbors + prediction = local_weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." prediction else @@ -68,7 +68,7 @@ module OpenTox end - def self.local_physchem_regression substance:, neighbors:, method: pls + def self.local_physchem_regression substance:, neighbors:, method: "pls" activities = [] weights = [] -- cgit v1.2.3 From 46c628f1757ce8274a0b277b3ec3306609b38c14 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 25 Jul 2016 15:53:22 +0200 Subject: local_weighted_average fallback fixed, cv predictions pulled from validations to avoid mongo document size errors --- lib/regression.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index d034d0b..269a743 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -48,7 +48,7 @@ module OpenTox end if variables.empty? - prediction = local_weighted_average substance, neighbors + prediction = local_weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." prediction else @@ -104,7 +104,7 @@ module OpenTox pc_ids.compact! if pc_ids.empty? - prediction = local_weighted_average substance, neighbors + prediction = local_weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." prediction else -- cgit v1.2.3 From 5d4e5e463c2b87241bbb56e4658e1e26c0ed084f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 5 Oct 2016 13:22:12 +0200 Subject: substance and nanoparticle model creation and predictions --- lib/regression.rb | 67 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 7 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 269a743..396c9e4 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,8 @@ module OpenTox class Regression - def self.local_weighted_average substance:, neighbors: + def self.weighted_average descriptors:nil, neighbors:, parameters:nil + # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 neighbors.each do |neighbor| @@ -18,7 +19,57 @@ module OpenTox {:value => prediction} end - def self.local_fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" + def self.caret descriptors:, neighbors:, method: "pls", parameters:nil + values = [] + descriptors = {} + weights = [] + descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort + + neighbors.each do |n| + activities = n["measurements"] + activities.each do |act| + values << act + weights << n["similarity"] + descriptor_ids.each do |id| + descriptors[id] ||= [] + descriptors[id] << n["descriptors"].include?(id) + end + end if activities + end + + variables = [] + data_frame = [values] + + descriptors.each do |k,v| + unless v.uniq.size == 1 + data_frame << v.collect{|m| m ? "T" : "F"} + variables << k + end + end + + if variables.empty? + prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction + else + substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"} + #puts data_frame.to_yaml + prediction = r_model_prediction method, data_frame, variables, weights, substance_features + if prediction.nil? or prediction[:value].nil? + prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) + prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." + prediction + else + prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] + prediction[:value] = prediction[:value] + prediction[:rmse] = prediction[:rmse] + prediction + end + end + + end + + def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" values = [] fingerprints = {} weights = [] @@ -48,14 +99,14 @@ module OpenTox end if variables.empty? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." prediction else substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} prediction = r_model_prediction method, data_frame, variables, weights, substance_features if prediction.nil? or prediction[:value].nil? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." prediction else @@ -68,7 +119,8 @@ module OpenTox end - def self.local_physchem_regression substance:, neighbors:, method: "pls" +=begin + def self.physchem_regression substance:, neighbors:, method: "pls" activities = [] weights = [] @@ -104,7 +156,7 @@ module OpenTox pc_ids.compact! if pc_ids.empty? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." prediction else @@ -122,7 +174,7 @@ module OpenTox pc_ids.compact! prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." end p prediction @@ -130,6 +182,7 @@ module OpenTox end end +=end def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights -- cgit v1.2.3 From 4348eec89033e6677c9f628646fc67bd03c73fe6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 6 Oct 2016 19:14:10 +0200 Subject: nano caret regression fixed --- lib/regression.rb | 220 ------------------------------------------------------ 1 file changed, 220 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 396c9e4..cf6d9cb 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -19,226 +19,6 @@ module OpenTox {:value => prediction} end - def self.caret descriptors:, neighbors:, method: "pls", parameters:nil - values = [] - descriptors = {} - weights = [] - descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort - - neighbors.each do |n| - activities = n["measurements"] - activities.each do |act| - values << act - weights << n["similarity"] - descriptor_ids.each do |id| - descriptors[id] ||= [] - descriptors[id] << n["descriptors"].include?(id) - end - end if activities - end - - variables = [] - data_frame = [values] - - descriptors.each do |k,v| - unless v.uniq.size == 1 - data_frame << v.collect{|m| m ? "T" : "F"} - variables << k - end - end - - if variables.empty? - prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." - prediction - else - substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"} - #puts data_frame.to_yaml - prediction = r_model_prediction method, data_frame, variables, weights, substance_features - if prediction.nil? or prediction[:value].nil? - prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) - prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." - prediction - else - prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] - prediction[:value] = prediction[:value] - prediction[:rmse] = prediction[:rmse] - prediction - end - end - - end - - def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" - values = [] - fingerprints = {} - weights = [] - fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort - - neighbors.each do |n| - fingerprint = Substance.find(n["_id"]).fingerprint - activities = n["measurements"] - activities.each do |act| - values << act - weights << n["similarity"] - fingerprint_ids.each do |id| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) - end - end if activities - end - - variables = [] - data_frame = [values] - - fingerprints.each do |k,v| - unless v.uniq.size == 1 - data_frame << v.collect{|m| m ? "T" : "F"} - variables << k - end - end - - if variables.empty? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." - prediction - else - substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} - prediction = r_model_prediction method, data_frame, variables, weights, substance_features - if prediction.nil? or prediction[:value].nil? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." - prediction - else - prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] - prediction[:value] = prediction[:value] - prediction[:rmse] = prediction[:rmse] - prediction - end - end - - end - -=begin - def self.physchem_regression substance:, neighbors:, method: "pls" - - activities = [] - weights = [] - pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort - data_frame = [] - data_frame[0] = [] - - neighbors.each_with_index do |n,i| - activities = n["measurements"] - activities.each do |act| - data_frame[0][i] = act - weights << n["similarity"] - n["common_descriptors"].each do |d| - j = pc_ids.index(d[:id])+1 - data_frame[j] ||= [] - data_frame[j][i] = d[:scaled_value] - end - end if activities - (0..pc_ids.size).each do |j| # for R: fill empty values with NA - data_frame[j] ||= [] - data_frame[j][i] ||= "NA" - end - end - - data_frame = data_frame.each_with_index.collect do |r,i| - if r.uniq.size == 1 # remove properties with a single value - r = nil - pc_ids[i-1] = nil # data_frame frame has additional activity entry - end - r - end - data_frame.compact! - pc_ids.compact! - - if pc_ids.empty? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." - prediction - else - query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } - query_descriptors = query_descriptors.each_with_index.collect do |v,i| - unless v - v = nil - data_frame[i] = nil - pc_ids[i] = nil - end - v - end - query_descriptors.compact! - data_frame.compact! - pc_ids.compact! - prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors - if prediction.nil? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." - end - p prediction - prediction - end - - end -=end - - def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values - R.assign "weights", training_weights - r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" -=begin -rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) - File.open("tmp.R","w+"){|f| - f.puts "suppressPackageStartupMessages({ - library(iterators,lib=\"#{rlib}\") - library(foreach,lib=\"#{rlib}\") - library(ggplot2,lib=\"#{rlib}\") - library(grid,lib=\"#{rlib}\") - library(gridExtra,lib=\"#{rlib}\") - library(pls,lib=\"#{rlib}\") - library(caret,lib=\"#{rlib}\") - library(doMC,lib=\"#{rlib}\") - registerDoMC(#{NR_CORES}) -})" - - f.puts "data <- #{r_data_frame}\n" - f.puts "weights <- c(#{training_weights.join(', ')})" - f.puts "features <- c(#{training_features.join(', ')})" - f.puts "names(data) <- append(c('activities'),features)" # - f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)" - f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)" - - f.puts "model <- train(activities ~ ., data = data, method = '#{method}')" - f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - f.puts "names(fingerprint) <- features" - f.puts "prediction <- predict(model,fingerprint)" - } -=end - - R.eval "data <- #{r_data_frame}" - R.assign "features", training_features - begin - R.eval "names(data) <- append(c('activities'),features)" # - R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" - R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - R.eval "names(fingerprint) <- features" - R.eval "prediction <- predict(model,fingerprint)" - value = R.eval("prediction").to_f - rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f - r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f - prediction_interval = value-1.96*rmse, value+1.96*rmse - { - :value => value, - :rmse => rmse, - :r_squared => r_squared, - :prediction_interval => prediction_interval - } - rescue - return nil - end - end - end end end -- cgit v1.2.3 From 91787edb3682900bc5a2feeca66e5142f387fcc6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 7 Oct 2016 10:25:58 +0200 Subject: unified interface for prediction algorithms --- lib/regression.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index cf6d9cb..0e5e06b 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,7 @@ module OpenTox class Regression - def self.weighted_average descriptors:nil, neighbors:, parameters:nil + def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 -- cgit v1.2.3 From dc4ab1f4e64d738d6c0b70f0b690a2359685080f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 12 Oct 2016 21:32:27 +0200 Subject: physchem regression, correlation_filter for fingerprints --- lib/regression.rb | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index 0e5e06b..bed6df8 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,18 +3,15 @@ module OpenTox class Regression - def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil + def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables: + #def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 - neighbors.each do |neighbor| - sim = neighbor["similarity"] - activities = neighbor["measurements"] - activities.each do |act| - weighted_sum += sim*act - sim_sum += sim - end if activities - end + dependent_variables.each_with_index do |v,i| + weighted_sum += weights[i]*dependent_variables[i] + sim_sum += weights[i] + end if dependent_variables sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end -- cgit v1.2.3 From 8d325866dd7cacdd04bd2306a9144a5e7300c7c8 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 10:11:09 +0200 Subject: molecular_weight fixed --- lib/regression.rb | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index bed6df8..d1724fd 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -4,7 +4,6 @@ module OpenTox class Regression def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables: - #def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 -- cgit v1.2.3 From 09452bba5c407c27721223d126e3f45c12b20a0c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 22:59:45 +0200 Subject: tests pass --- lib/regression.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/regression.rb') diff --git a/lib/regression.rb b/lib/regression.rb index d1724fd..3890987 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,7 @@ module OpenTox class Regression - def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables: + def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 -- cgit v1.2.3