author     Christoph Helma <helma@in-silico.ch>  2016-05-04 19:24:42 +0200
committer  Christoph Helma <helma@in-silico.ch>  2016-05-04 19:24:42 +0200
commit     05386e748270c337c66f6f379317ea4b25905236 (patch)
tree       4b62f1c9449dded9cd7670715a5735f5bc93dd2e
parent     79238bddb59607aa9f759caa9e3c8db176709703 (diff)
first reasonable results for nanoparticle crossvalidation
-rw-r--r--  lib/crossvalidation.rb       |   4
-rw-r--r--  lib/model.rb                 | 101
-rw-r--r--  lib/nanoparticle.rb          |  18
-rw-r--r--  lib/regression.rb            |  38
-rw-r--r--  lib/validation-statistics.rb |   7
-rw-r--r--  scripts/import-enm.rb        |   6
-rw-r--r--  scripts/mg2mmol.rb           |  17
-rw-r--r--  scripts/mmol2-log10.rb       |  17
-rw-r--r--  test/nanoparticles.rb        |  23
9 files changed, 146 insertions, 85 deletions
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 0ae36c4..e1f956b 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -141,7 +141,7 @@ module OpenTox
:measured => p[1],
:predicted => p[2],
#:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
- :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+ :error => (p[1]-p[2]).abs,
:relative_error => (p[1]-p[2]).abs/p[1],
:confidence => p[3],
:neighbors => neighbors
@@ -152,7 +152,7 @@ module OpenTox
def confidence_plot
tmpfile = "/tmp/#{id.to_s}_confidence.png"
- sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
+ sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact
R.assign "error", sorted_predictions.collect{|p| p[0]}
R.assign "confidence", sorted_predictions.collect{|p| p[1]}
# TODO fix axis names
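
The error terms above no longer apply a log10 transformation: activities are expected to arrive already on a -log10 scale (see scripts/mmol2-log10.rb further down), so :error is a plain absolute difference. A minimal sketch of the two terms, with hypothetical method and argument names:

    # Sketch only: per-prediction error terms as computed above, assuming
    # measured and predicted are already -log10-transformed Floats.
    def prediction_errors measured, predicted
      {
        :error          => (measured - predicted).abs,
        :relative_error => (measured - predicted).abs / measured
      }
    end

    prediction_errors 3.2, 2.9   # => {:error=>0.3..., :relative_error=>0.09...}
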
diff --git a/lib/model.rb b/lib/model.rb
index f61368e..841ab20 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -2,7 +2,7 @@ module OpenTox
module Model
- class Model
+ class Lazar
include OpenTox
include Mongoid::Document
include Mongoid::Timestamps
@@ -10,27 +10,13 @@ module OpenTox
field :name, type: String
field :creator, type: String, default: __FILE__
- # datasets
field :training_dataset_id, type: BSON::ObjectId
- # algorithms
field :prediction_algorithm, type: String
- # prediction feature
field :prediction_feature_id, type: BSON::ObjectId
-
- def training_dataset
- Dataset.find(training_dataset_id)
- end
-
- def prediction_feature
- Feature.find(prediction_feature_id)
- end
- end
-
- class Lazar < Model
-
- # algorithms
field :neighbor_algorithm, type: String
field :neighbor_algorithm_parameters, type: Hash, default: {}
+ field :feature_selection_algorithm, type: String
+ field :relevant_features, type: Hash
# Create a lazar model from a training_dataset and a feature_dataset
# @param [OpenTox::Dataset] training_dataset
@@ -45,10 +31,43 @@ module OpenTox
self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
self.neighbor_algorithm_parameters ||= {}
self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
+
+ Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm
save
self
end
+ def correlation_filter
+ toxicities = []
+ substances = []
+ training_dataset.substances.each do |s|
+ s["toxicities"][prediction_feature_id].each do |act|
+ toxicities << act
+ substances << s
+ end
+ end
+ R.assign "tox", toxicities
+ feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
+ feature_ids.each do |feature_id|
+ feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]}
+ R.assign "feature", feature_values
+ begin
+ #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
+ R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')"
+ pvalue = R.eval("cor$p.value").to_ruby
+ if pvalue <= 0.05
+ r = R.eval("cor$estimate").to_ruby
+ relevant_features[feature] = {}
+ relevant_features[feature]["pvalue"] = pvalue
+ relevant_features[feature]["r"] = r
+ end
+ rescue
+ warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed."
+ end
+ end
+ relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
+ end
+
def predict_compound compound
neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
# remove neighbors without prediction_feature
@@ -63,7 +82,6 @@ module OpenTox
prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
neighbors.delete_if{|n| n["_id"] == compound.id}
end
- #neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] }
if neighbors.empty?
prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
else
@@ -123,6 +141,14 @@ module OpenTox
end
+ def training_dataset
+ Dataset.find(training_dataset_id)
+ end
+
+ def prediction_feature
+ Feature.find(prediction_feature_id)
+ end
+
end
class LazarClassification < Lazar
@@ -229,45 +255,6 @@ module OpenTox
end
end
- class NanoLazar
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: "models"
-
- field :name, type: String
- field :creator, type: String, default: __FILE__
- # datasets
- field :training_dataset_id, type: BSON::ObjectId
- # algorithms
- field :prediction_algorithm, type: String
- # prediction feature
- field :prediction_feature_id, type: BSON::ObjectId
- field :training_particle_ids, type: Array
-
- def self.create_all
- nanoparticles = Nanoparticle.all
- toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id}
- tox = {}
- toxfeatures.each do |t|
- tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s}
- end
- tox.select!{|t,nps| nps.size > 50}
- tox.collect do |t,nps|
- find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id})
- end
- end
-
- def predict nanoparticle
- training = training_particle_ids.collect{|id| Nanoparticle.find id}
- training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq
- query_features = nanoparticle.physchem_descriptors.keys
- common_features = (training_features & query_features)
- #p common_features
- end
-
- end
-
end
end
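
Feature selection becomes a pluggable step: if feature_selection_algorithm is set, Algorithm.run invokes it before the model is saved, and correlation_filter keeps the physchem descriptors whose Pearson correlation with the toxicity values passes R's cor.test at p <= 0.05. Below is a standalone plain-Ruby sketch of the same idea, without the Rserve bridge; the method names, the fixed |r| cutoff and the assumption of one numeric value per descriptor are illustrative only, and the result hash is keyed by feature_id:

    # Standalone sketch of a correlation-based descriptor filter. The committed
    # method uses R's cor.test p-value (p <= 0.05) instead of a fixed |r| cutoff.
    def pearson xs, ys
      n  = xs.size.to_f
      mx = xs.sum / n
      my = ys.sum / n
      sx = Math.sqrt(xs.sum { |x| (x - mx)**2 })
      sy = Math.sqrt(ys.sum { |y| (y - my)**2 })
      return 0.0 if sx.zero? or sy.zero?
      xs.zip(ys).sum { |x, y| (x - mx) * (y - my) } / (sx * sy)
    end

    def correlation_filter_sketch substances, prediction_feature_id
      toxicities, subs = [], []
      substances.each do |s|
        (s["toxicities"][prediction_feature_id] || []).each do |act|
          toxicities << act
          subs << s
        end
      end
      relevant = {}
      feature_ids = subs.flat_map { |s| s["physchem_descriptors"].keys }.uniq
      feature_ids.each do |fid|
        pairs = toxicities.zip(subs.map { |s| s["physchem_descriptors"][fid] })
                          .select { |_, v| v }        # skip missing descriptors
        next if pairs.size < 3
        r = pearson(pairs.map(&:last), pairs.map(&:first))
        relevant[fid] = { "r" => r } if r.abs >= 0.5  # placeholder cutoff
      end
      relevant.sort_by { |_, v| -v["r"].abs }.to_h
    end
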
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 83b97a9..dda4a9f 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -8,7 +8,7 @@ module OpenTox
field :bundles, type: Array, default: []
def nanoparticle_neighbors params
- Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| {"_id" => np.id, "tanimoto" => 1}}
+ Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np}
end
def add_feature feature, value
@@ -19,7 +19,19 @@ module OpenTox
physchem_descriptors[feature.id.to_s].uniq!
when "TOX"
toxicities[feature.id.to_s] ||= []
- toxicities[feature.id.to_s] << value
+ # TODO generic way of parsing TOX values
+ if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)"
+ toxicities[feature.id.to_s] << -Math.log10(value)
+ #if value.numeric?
+ #begin
+ #rescue
+ #p feature
+ #p value
+ #exit
+ #end
+ else
+ toxicities[feature.id.to_s] << value
+ end
toxicities[feature.id.to_s].uniq!
else
warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
@@ -29,7 +41,7 @@ module OpenTox
def parse_ambit_value feature, v
v.delete "unit"
- # TODO: mmol/log10 conversion
+ # TODO: ppm instead of weights
if v.keys == ["textValue"]
add_feature feature, v["textValue"]
elsif v.keys == ["loValue"]
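
The ICP-AES endpoint is now converted to a -log10 scale at import time (the TODO notes that generic unit handling is still missing), so the regression code further down no longer log-transforms activities itself. For a hypothetical raw value:

    value = 0.005        # hypothetical raw measurement in mL/ug(Mg)
    -Math.log10(value)   # => 2.30102...
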
diff --git a/lib/regression.rb b/lib/regression.rb
index 694a2dc..d2c4e91 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -12,16 +12,15 @@ module OpenTox
sim ||= 1 # TODO: sim f nanoparticles
if row["toxicities"][params[:prediction_feature_id].to_s]
row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
- weighted_sum += sim*Math.log10(act)
+ weighted_sum += sim*act
sim_sum += sim
end
end
end
- sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
+ sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
{:value => prediction}
end
- # TODO explicit neighbors, also for physchem
def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
neighbors = params[:neighbors]
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
@@ -35,7 +34,7 @@ module OpenTox
fingerprint = neighbor.fingerprint
if row["toxicities"][params[:prediction_feature_id].to_s]
row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
- activities << Math.log10(act)
+ activities << act
weights << row["tanimoto"]
fingerprint_ids.each_with_index do |id,j|
fingerprints[id] ||= []
@@ -67,9 +66,9 @@ module OpenTox
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
return prediction
else
- prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
- prediction[:value] = 10**prediction[:value]
- prediction[:rmse] = 10**prediction[:rmse]
+ prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
+ prediction[:value] = prediction[:value]
+ prediction[:rmse] = prediction[:rmse]
prediction
end
end
@@ -96,7 +95,7 @@ module OpenTox
n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
neighbor.physchem_descriptors.each do |pid,values|
values.uniq!
- warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1
+ warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
j = pc_ids.index(pid)+1
data_frame[j] ||= []
data_frame[j][i] = values.for_R
@@ -121,7 +120,9 @@ module OpenTox
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
return result
else
- query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact
+ query_descriptors = pc_ids.collect do |i|
+ compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
+ end
remove_idx = []
query_descriptors.each_with_index do |v,i|
remove_idx << i if v == "NA"
@@ -137,7 +138,6 @@ module OpenTox
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
return prediction
else
- prediction[:value] = 10**prediction[:value]
prediction
end
end
@@ -148,6 +148,7 @@ module OpenTox
R.assign "weights", training_weights
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
+=begin
File.open("tmp.R","w+"){|f|
f.puts "suppressPackageStartupMessages({
library(iterators,lib=\"#{rlib}\")
@@ -170,20 +171,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
f.puts "names(fingerprint) <- features"
f.puts "prediction <- predict(model,fingerprint)"
}
+=end
R.eval "data <- #{r_data_frame}"
R.assign "features", training_features
begin
R.eval "names(data) <- append(c('activities'),features)" #
R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
- R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
- R.eval "names(fingerprint) <- features"
- R.eval "prediction <- predict(model,fingerprint)"
- {
- :value => R.eval("prediction").to_f,
- :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
- :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
- }
+ R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
+ R.eval "names(fingerprint) <- features"
+ R.eval "prediction <- predict(model,fingerprint)"
+ {
+ :value => R.eval("prediction").to_f,
+ :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
+ :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
+ }
rescue
return nil
end
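
With activities already transformed at import, the regression code drops all log10/10** conversions: the weighted average works directly on the stored values, the PLS prediction interval stays on the modelling scale, and missing query descriptors are encoded as "NA" so their columns can be removed before the R call. A minimal sketch of the weighted average as it now behaves (the neighbor format is made up):

    # Minimal sketch of the weighted average above, with activities already on
    # the modelling scale (no 10** back-transform).
    def weighted_average_sketch neighbors
      weighted_sum = 0.0
      sim_sum      = 0.0
      neighbors.each do |n|
        sim = n[:similarity] || 1.0   # nanoparticle neighbors currently get sim = 1
        n[:activities].each do |act|
          weighted_sum += sim * act
          sim_sum      += sim
        end
      end
      sim_sum.zero? ? nil : weighted_sum / sim_sum
    end

    weighted_average_sketch [
      { :similarity => 1.0, :activities => [2.1, 2.3] },
      { :similarity => 1.0, :activities => [1.8] }
    ]
    # => 2.0666...
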
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index c6b2a07..b7c95f6 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -63,16 +63,15 @@ module OpenTox
end
def self.regression predictions
- # TODO: prediction intervals
rmse = 0
mae = 0
x = []
y = []
predictions.each do |cid,pred|
if pred[:value] and pred[:measured] #and pred[:measured] != [nil]
- x << -Math.log10(pred[:measured].median)
- y << -Math.log10(pred[:value])
- error = Math.log10(pred[:value])-Math.log10(pred[:measured].median)
+ x << pred[:measured].median
+ y << pred[:value]
+ error = pred[:value]-pred[:measured].median
rmse += error**2
mae += error.abs
else
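
The validation statistics follow the same move: measured medians and predicted values are compared directly instead of on a freshly applied -log10 scale. A sketch of the aggregate statistics with illustrative names (the committed code accumulates rmse and mae inside the predictions loop and normalizes afterwards):

    # Sketch of the regression statistics on the untransformed scale used above.
    def rmse_mae measured, predicted
      errors = measured.zip(predicted).map { |m, p| p - m }
      n = errors.size.to_f
      {
        :rmse => Math.sqrt(errors.sum { |e| e**2 } / n),
        :mae  => errors.sum { |e| e.abs } / n
      }
    end

    rmse_mae [2.0, 3.0, 1.5], [2.2, 2.7, 1.5]
    # => {:rmse=>0.208..., :mae=>0.166...}
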
diff --git a/scripts/import-enm.rb b/scripts/import-enm.rb
new file mode 100644
index 0000000..9cbe5d4
--- /dev/null
+++ b/scripts/import-enm.rb
@@ -0,0 +1,6 @@
+require_relative '../lib/lazar'
+include OpenTox
+$mongo.database.drop
+$gridfs = $mongo.database.fs # recreate GridFS indexes
+Import::Enanomapper.import
+`mongodump -h 127.0.0.1 -d production`
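
The trailing mongodump writes the freshly imported production database to dump/production (mongodump's default output directory), which is exactly the dump that the reworked test setup below restores into the development database, so the slow eNanoMapper import only has to run once.
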
diff --git a/scripts/mg2mmol.rb b/scripts/mg2mmol.rb
new file mode 100644
index 0000000..dc6b953
--- /dev/null
+++ b/scripts/mg2mmol.rb
@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+require_relative '../lazar/lib/lazar'
+include OpenTox
+newfile = ARGV[0].sub(/.csv/,"_mmol.csv")
+p newfile
+CSV.open(newfile, "wb") do |csv|
+ CSV.read(ARGV[0]).each do |line|
+ smi,mg = line
+ if mg.numeric?
+ c = Compound.from_smiles smi
+ mmol = c.mg_to_mmol mg.to_f
+ csv << [smi, mmol]
+ else
+ csv << [smi, mg.gsub(/mg/,'mmol')]
+ end
+ end
+end
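
mg_to_mmol is a per-compound unit conversion; assuming it simply divides the mass by the compound's molecular weight, which is the usual definition, the arithmetic is:

    # Illustrative only: mg -> mmol via the molecular weight (g/mol).
    def mg_to_mmol_sketch mg, molecular_weight
      mg / molecular_weight
    end

    mg_to_mmol_sketch 180.0, 180.16   # ~1 mmol for 180 mg of glucose
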
diff --git a/scripts/mmol2-log10.rb b/scripts/mmol2-log10.rb
new file mode 100644
index 0000000..0c99a0b
--- /dev/null
+++ b/scripts/mmol2-log10.rb
@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+require_relative '../lib/lazar'
+include OpenTox
+newfile = ARGV[0].sub(/.csv/,"_log10.csv")
+p newfile
+CSV.open(newfile, "wb") do |csv|
+ CSV.read(ARGV[0]).each do |line|
+ smi,mmol = line
+ if mmol.numeric?
+ c = Compound.from_smiles smi
+ mmol = -Math.log10(mmol.to_f)
+ csv << [smi, mmol]
+ else
+ csv << [smi, "-log10(#{mmol})"]
+ end
+ end
+end
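
The two scripts are meant to be chained on a SMILES,value CSV (file names hypothetical): mg2mmol.rb turns data/tox.csv into data/tox_mmol.csv, and mmol2-log10.rb then writes data/tox_mmol_log10.csv with activities as -log10(mmol), the scale the regression and validation code above now assumes. Non-numeric entries are passed through, with the unit string rewritten (mg to mmol) or wrapped as -log10(...).
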
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 46c6620..7308a83 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -1,8 +1,14 @@
require_relative "setup.rb"
+
class NanoparticleTest < MiniTest::Test
+ def setup
+ `mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}`
+ end
+
def test_import
+ skip
dataset_ids = Import::Enanomapper.import
assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
@@ -17,6 +23,7 @@ class NanoparticleTest < MiniTest::Test
end
def test_summaries
+ skip
features = Feature.all.to_a
#p features.collect do |f|
#f if f.category == "TOX"
@@ -51,6 +58,18 @@ class NanoparticleTest < MiniTest::Test
end
end
+ def test_create_model_with_feature_selection
+ training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+ feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)")
+ model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
+ nanoparticle = training_dataset.nanoparticles[-34]
+ #p nanoparticle.neighbors
+ prediction = model.predict nanoparticle
+ p prediction
+ #p prediction
+ refute_nil prediction[:value]
+ end
+
def test_create_model
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)")
@@ -66,7 +85,9 @@ class NanoparticleTest < MiniTest::Test
def test_validate_model
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)")
- model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
+ #model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
+ model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "nanoparticle_neighbors"})
+ p model
cv = RegressionCrossValidation.create model
p cv
end
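
The reworked tests restore the dump produced by scripts/import-enm.rb before each test and skip the slow import and summary tests; validation currently uses the weighted average algorithm, with the physchem regression variant kept as a comment. Individual tests can be run with Minitest's standard name filter, e.g. ruby test/nanoparticles.rb -n test_create_model_with_feature_selection, provided the dump directory from the import script is present.
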