From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 May 2016 19:24:42 +0200 Subject: first reasonable results for nanoparticle crossvalidation --- lib/crossvalidation.rb | 4 +- lib/model.rb | 101 +++++++++++++++++++------------------------ lib/nanoparticle.rb | 18 ++++++-- lib/regression.rb | 38 ++++++++-------- lib/validation-statistics.rb | 7 ++- scripts/import-enm.rb | 6 +++ scripts/mg2mmol.rb | 17 ++++++++ scripts/mmol2-log10.rb | 17 ++++++++ test/nanoparticles.rb | 23 +++++++++- 9 files changed, 146 insertions(+), 85 deletions(-) create mode 100644 scripts/import-enm.rb create mode 100644 scripts/mg2mmol.rb create mode 100644 scripts/mmol2-log10.rb diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 0ae36c4..e1f956b 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -141,7 +141,7 @@ module OpenTox :measured => p[1], :predicted => p[2], #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, - :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs, + :error => (p[1]-p[2]).abs, :relative_error => (p[1]-p[2]).abs/p[1], :confidence => p[3], :neighbors => neighbors @@ -152,7 +152,7 @@ module OpenTox def confidence_plot tmpfile = "/tmp/#{id.to_s}_confidence.png" - sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact + sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact R.assign "error", sorted_predictions.collect{|p| p[0]} R.assign "confidence", sorted_predictions.collect{|p| p[1]} # TODO fix axis names diff --git a/lib/model.rb b/lib/model.rb index f61368e..841ab20 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -2,7 +2,7 @@ module OpenTox module Model - class Model + class Lazar include OpenTox include Mongoid::Document include Mongoid::Timestamps @@ -10,27 +10,13 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ - # datasets field :training_dataset_id, type: BSON::ObjectId - # algorithms field :prediction_algorithm, type: String - # prediction feature field :prediction_feature_id, type: BSON::ObjectId - - def training_dataset - Dataset.find(training_dataset_id) - end - - def prediction_feature - Feature.find(prediction_feature_id) - end - end - - class Lazar < Model - - # algorithms field :neighbor_algorithm, type: String field :neighbor_algorithm_parameters, type: Hash, default: {} + field :feature_selection_algorithm, type: String + field :relevant_features, type: Hash # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset @@ -45,10 +31,43 @@ module OpenTox self.name ||= "#{training_dataset.name} #{prediction_feature.name}" self.neighbor_algorithm_parameters ||= {} self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + + Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm save self end + def correlation_filter + toxicities = [] + substances = [] + training_dataset.substances.each do |s| + s["toxicities"][prediction_feature_id].each do |act| + toxicities << act + substances << s + end + end + R.assign "tox", toxicities + feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids.each do |feature_id| + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} + R.assign "feature", feature_values + begin + #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature] = {} + relevant_features[feature]["pvalue"] = pvalue + relevant_features[feature]["r"] = r + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." + end + end + relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + end + def predict_compound compound neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature @@ -63,7 +82,6 @@ module OpenTox prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end - #neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) else @@ -123,6 +141,14 @@ module OpenTox end + def training_dataset + Dataset.find(training_dataset_id) + end + + def prediction_feature + Feature.find(prediction_feature_id) + end + end class LazarClassification < Lazar @@ -229,45 +255,6 @@ module OpenTox end end - class NanoLazar - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "models" - - field :name, type: String - field :creator, type: String, default: __FILE__ - # datasets - field :training_dataset_id, type: BSON::ObjectId - # algorithms - field :prediction_algorithm, type: String - # prediction feature - field :prediction_feature_id, type: BSON::ObjectId - field :training_particle_ids, type: Array - - def self.create_all - nanoparticles = Nanoparticle.all - toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id} - tox = {} - toxfeatures.each do |t| - tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s} - end - tox.select!{|t,nps| nps.size > 50} - tox.collect do |t,nps| - find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id}) - end - end - - def predict nanoparticle - training = training_particle_ids.collect{|id| Nanoparticle.find id} - training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq - query_features = nanoparticle.physchem_descriptors.keys - common_features = (training_features & query_features) - #p common_features - end - - end - end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 83b97a9..dda4a9f 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -8,7 +8,7 @@ module OpenTox field :bundles, type: Array, default: [] def nanoparticle_neighbors params - Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| {"_id" => np.id, "tanimoto" => 1}} + Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np} end def add_feature feature, value @@ -19,7 +19,19 @@ module OpenTox physchem_descriptors[feature.id.to_s].uniq! when "TOX" toxicities[feature.id.to_s] ||= [] - toxicities[feature.id.to_s] << value + # TODO generic way of parsing TOX values + if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" + toxicities[feature.id.to_s] << -Math.log10(value) + #if value.numeric? + #begin + #rescue + #p feature + #p value + #exit + #end + else + toxicities[feature.id.to_s] << value + end toxicities[feature.id.to_s].uniq! else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." @@ -29,7 +41,7 @@ module OpenTox def parse_ambit_value feature, v v.delete "unit" - # TODO: mmol/log10 conversion + # TODO: ppm instead of weights if v.keys == ["textValue"] add_feature feature, v["textValue"] elsif v.keys == ["loValue"] diff --git a/lib/regression.rb b/lib/regression.rb index 694a2dc..d2c4e91 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -12,16 +12,15 @@ module OpenTox sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum += sim*Math.log10(act) + weighted_sum += sim*act sim_sum += sim end end end - sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) + sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end - # TODO explicit neighbors, also for physchem def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -35,7 +34,7 @@ module OpenTox fingerprint = neighbor.fingerprint if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) + activities << act weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| fingerprints[id] ||= [] @@ -67,9 +66,9 @@ module OpenTox prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])] - prediction[:value] = 10**prediction[:value] - prediction[:rmse] = 10**prediction[:rmse] + prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] + prediction[:value] = prediction[:value] + prediction[:rmse] = prediction[:rmse] prediction end end @@ -96,7 +95,7 @@ module OpenTox n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| values.uniq! - warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1 + warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 data_frame[j] ||= [] data_frame[j][i] = values.for_R @@ -121,7 +120,9 @@ module OpenTox result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else - query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact + query_descriptors = pc_ids.collect do |i| + compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + end remove_idx = [] query_descriptors.each_with_index do |v,i| remove_idx << i if v == "NA" @@ -137,7 +138,6 @@ module OpenTox prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - prediction[:value] = 10**prediction[:value] prediction end end @@ -148,6 +148,7 @@ module OpenTox R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) +=begin File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ library(iterators,lib=\"#{rlib}\") @@ -170,20 +171,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } +=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features begin R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" - R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - R.eval "names(fingerprint) <- features" - R.eval "prediction <- predict(model,fingerprint)" - { - :value => R.eval("prediction").to_f, - :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, - :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, - } + R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + R.eval "names(fingerprint) <- features" + R.eval "prediction <- predict(model,fingerprint)" + { + :value => R.eval("prediction").to_f, + :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, + :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + } rescue return nil end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index c6b2a07..b7c95f6 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -63,16 +63,15 @@ module OpenTox end def self.regression predictions - # TODO: prediction intervals rmse = 0 mae = 0 x = [] y = [] predictions.each do |cid,pred| if pred[:value] and pred[:measured] #and pred[:measured] != [nil] - x << -Math.log10(pred[:measured].median) - y << -Math.log10(pred[:value]) - error = Math.log10(pred[:value])-Math.log10(pred[:measured].median) + x << pred[:measured].median + y << pred[:value] + error = pred[:value]-pred[:measured].median rmse += error**2 mae += error.abs else diff --git a/scripts/import-enm.rb b/scripts/import-enm.rb new file mode 100644 index 0000000..9cbe5d4 --- /dev/null +++ b/scripts/import-enm.rb @@ -0,0 +1,6 @@ +require_relative '../lib/lazar' +include OpenTox +$mongo.database.drop +$gridfs = $mongo.database.fs # recreate GridFS indexes +Import::Enanomapper.import +`mongodump -h 127.0.0.1 -d production` diff --git a/scripts/mg2mmol.rb b/scripts/mg2mmol.rb new file mode 100644 index 0000000..dc6b953 --- /dev/null +++ b/scripts/mg2mmol.rb @@ -0,0 +1,17 @@ +#!/usr/bin/env ruby +require_relative '../lazar/lib/lazar' +include OpenTox +newfile = ARGV[0].sub(/.csv/,"_mmol.csv") +p newfile +CSV.open(newfile, "wb") do |csv| + CSV.read(ARGV[0]).each do |line| + smi,mg = line + if mg.numeric? + c = Compound.from_smiles smi + mmol = c.mg_to_mmol mg.to_f + csv << [smi, mmol] + else + csv << [smi, mg.gsub(/mg/,'mmol')] + end + end +end diff --git a/scripts/mmol2-log10.rb b/scripts/mmol2-log10.rb new file mode 100644 index 0000000..0c99a0b --- /dev/null +++ b/scripts/mmol2-log10.rb @@ -0,0 +1,17 @@ +#!/usr/bin/env ruby +require_relative '../lib/lazar' +include OpenTox +newfile = ARGV[0].sub(/.csv/,"_log10.csv") +p newfile +CSV.open(newfile, "wb") do |csv| + CSV.read(ARGV[0]).each do |line| + smi,mmol = line + if mmol.numeric? + c = Compound.from_smiles smi + mmol = -Math.log10(mmol.to_f) + csv << [smi, mmol] + else + csv << [smi, "-log10(#{mmol})"] + end + end +end diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 46c6620..7308a83 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -1,8 +1,14 @@ require_relative "setup.rb" + class NanoparticleTest < MiniTest::Test + def setup + `mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}` + end + def test_import + skip dataset_ids = Import::Enanomapper.import assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported" assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported" @@ -17,6 +23,7 @@ class NanoparticleTest < MiniTest::Test end def test_summaries + skip features = Feature.all.to_a #p features.collect do |f| #f if f.category == "TOX" @@ -51,6 +58,18 @@ class NanoparticleTest < MiniTest::Test end end + def test_create_model_with_feature_selection + training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") + feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)") + model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"}) + nanoparticle = training_dataset.nanoparticles[-34] + #p nanoparticle.neighbors + prediction = model.predict nanoparticle + p prediction + #p prediction + refute_nil prediction[:value] + end + def test_create_model training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)") @@ -66,7 +85,9 @@ class NanoparticleTest < MiniTest::Test def test_validate_model training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)") - model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"}) + #model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"}) + model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "nanoparticle_neighbors"}) + p model cv = RegressionCrossValidation.create model p cv end -- cgit v1.2.3