From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 May 2016 19:24:42 +0200 Subject: first reasonable results for nanoparticle crossvalidation --- lib/crossvalidation.rb | 4 +- lib/model.rb | 101 +++++++++++++++++++------------------------ lib/nanoparticle.rb | 18 ++++++-- lib/regression.rb | 38 ++++++++-------- lib/validation-statistics.rb | 7 ++- 5 files changed, 84 insertions(+), 84 deletions(-) (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 0ae36c4..e1f956b 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -141,7 +141,7 @@ module OpenTox :measured => p[1], :predicted => p[2], #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, - :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs, + :error => (p[1]-p[2]).abs, :relative_error => (p[1]-p[2]).abs/p[1], :confidence => p[3], :neighbors => neighbors @@ -152,7 +152,7 @@ module OpenTox def confidence_plot tmpfile = "/tmp/#{id.to_s}_confidence.png" - sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact + sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact R.assign "error", sorted_predictions.collect{|p| p[0]} R.assign "confidence", sorted_predictions.collect{|p| p[1]} # TODO fix axis names diff --git a/lib/model.rb b/lib/model.rb index f61368e..841ab20 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -2,7 +2,7 @@ module OpenTox module Model - class Model + class Lazar include OpenTox include Mongoid::Document include Mongoid::Timestamps @@ -10,27 +10,13 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ - # datasets field :training_dataset_id, type: BSON::ObjectId - # algorithms field :prediction_algorithm, type: String - # prediction feature field :prediction_feature_id, type: BSON::ObjectId - - def training_dataset - Dataset.find(training_dataset_id) - end - - def prediction_feature - Feature.find(prediction_feature_id) - end - end - - class Lazar < Model - - # algorithms field :neighbor_algorithm, type: String field :neighbor_algorithm_parameters, type: Hash, default: {} + field :feature_selection_algorithm, type: String + field :relevant_features, type: Hash # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset @@ -45,10 +31,43 @@ module OpenTox self.name ||= "#{training_dataset.name} #{prediction_feature.name}" self.neighbor_algorithm_parameters ||= {} self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + + Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm save self end + def correlation_filter + toxicities = [] + substances = [] + training_dataset.substances.each do |s| + s["toxicities"][prediction_feature_id].each do |act| + toxicities << act + substances << s + end + end + R.assign "tox", toxicities + feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids.each do |feature_id| + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} + R.assign "feature", feature_values + begin + #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature] = {} + relevant_features[feature]["pvalue"] = pvalue + relevant_features[feature]["r"] = r + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." + end + end + relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + end + def predict_compound compound neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature @@ -63,7 +82,6 @@ module OpenTox prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end - #neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) else @@ -123,6 +141,14 @@ module OpenTox end + def training_dataset + Dataset.find(training_dataset_id) + end + + def prediction_feature + Feature.find(prediction_feature_id) + end + end class LazarClassification < Lazar @@ -229,45 +255,6 @@ module OpenTox end end - class NanoLazar - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "models" - - field :name, type: String - field :creator, type: String, default: __FILE__ - # datasets - field :training_dataset_id, type: BSON::ObjectId - # algorithms - field :prediction_algorithm, type: String - # prediction feature - field :prediction_feature_id, type: BSON::ObjectId - field :training_particle_ids, type: Array - - def self.create_all - nanoparticles = Nanoparticle.all - toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id} - tox = {} - toxfeatures.each do |t| - tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s} - end - tox.select!{|t,nps| nps.size > 50} - tox.collect do |t,nps| - find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id}) - end - end - - def predict nanoparticle - training = training_particle_ids.collect{|id| Nanoparticle.find id} - training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq - query_features = nanoparticle.physchem_descriptors.keys - common_features = (training_features & query_features) - #p common_features - end - - end - end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 83b97a9..dda4a9f 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -8,7 +8,7 @@ module OpenTox field :bundles, type: Array, default: [] def nanoparticle_neighbors params - Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| {"_id" => np.id, "tanimoto" => 1}} + Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np} end def add_feature feature, value @@ -19,7 +19,19 @@ module OpenTox physchem_descriptors[feature.id.to_s].uniq! when "TOX" toxicities[feature.id.to_s] ||= [] - toxicities[feature.id.to_s] << value + # TODO generic way of parsing TOX values + if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" + toxicities[feature.id.to_s] << -Math.log10(value) + #if value.numeric? + #begin + #rescue + #p feature + #p value + #exit + #end + else + toxicities[feature.id.to_s] << value + end toxicities[feature.id.to_s].uniq! else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." @@ -29,7 +41,7 @@ module OpenTox def parse_ambit_value feature, v v.delete "unit" - # TODO: mmol/log10 conversion + # TODO: ppm instead of weights if v.keys == ["textValue"] add_feature feature, v["textValue"] elsif v.keys == ["loValue"] diff --git a/lib/regression.rb b/lib/regression.rb index 694a2dc..d2c4e91 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -12,16 +12,15 @@ module OpenTox sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum += sim*Math.log10(act) + weighted_sum += sim*act sim_sum += sim end end end - sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) + sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end - # TODO explicit neighbors, also for physchem def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -35,7 +34,7 @@ module OpenTox fingerprint = neighbor.fingerprint if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) + activities << act weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| fingerprints[id] ||= [] @@ -67,9 +66,9 @@ module OpenTox prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])] - prediction[:value] = 10**prediction[:value] - prediction[:rmse] = 10**prediction[:rmse] + prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] + prediction[:value] = prediction[:value] + prediction[:rmse] = prediction[:rmse] prediction end end @@ -96,7 +95,7 @@ module OpenTox n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| values.uniq! - warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1 + warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 data_frame[j] ||= [] data_frame[j][i] = values.for_R @@ -121,7 +120,9 @@ module OpenTox result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else - query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact + query_descriptors = pc_ids.collect do |i| + compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + end remove_idx = [] query_descriptors.each_with_index do |v,i| remove_idx << i if v == "NA" @@ -137,7 +138,6 @@ module OpenTox prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - prediction[:value] = 10**prediction[:value] prediction end end @@ -148,6 +148,7 @@ module OpenTox R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) +=begin File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ library(iterators,lib=\"#{rlib}\") @@ -170,20 +171,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } +=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features begin R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" - R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - R.eval "names(fingerprint) <- features" - R.eval "prediction <- predict(model,fingerprint)" - { - :value => R.eval("prediction").to_f, - :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, - :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, - } + R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + R.eval "names(fingerprint) <- features" + R.eval "prediction <- predict(model,fingerprint)" + { + :value => R.eval("prediction").to_f, + :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, + :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + } rescue return nil end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index c6b2a07..b7c95f6 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -63,16 +63,15 @@ module OpenTox end def self.regression predictions - # TODO: prediction intervals rmse = 0 mae = 0 x = [] y = [] predictions.each do |cid,pred| if pred[:value] and pred[:measured] #and pred[:measured] != [nil] - x << -Math.log10(pred[:measured].median) - y << -Math.log10(pred[:value]) - error = Math.log10(pred[:value])-Math.log10(pred[:measured].median) + x << pred[:measured].median + y << pred[:value] + error = pred[:value]-pred[:measured].median rmse += error**2 mae += error.abs else -- cgit v1.2.3