summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Christoph Helma <helma@in-silico.ch> 2016-06-02 17:54:48 +0200
committer: Christoph Helma <helma@in-silico.ch> 2016-06-02 17:54:48 +0200
commit: eec5bddbd35c9ecee8021128508d8718bccb4fe3 (patch)
tree: 93765b1f0b97415e7df5abdbcab7086f8c2fa7cf
parent: 85f2308c101b4778508c2d767e08af4cfd671b7b (diff)
local pls regression for nanoparticle proteomics
-rw-r--r-- lib/import.rb 15
-rw-r--r-- lib/nanoparticle.rb 12
-rw-r--r-- lib/regression.rb 41
-rw-r--r-- test/nanoparticles.rb 36
4 files changed, 52 insertions, 52 deletions
diff --git a/lib/import.rb b/lib/import.rb
index 80d4579..4c49e5e 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -68,17 +68,10 @@ module OpenTox
effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
effect["conditions"].delete_if { |k, v| v.nil? }
if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
-=begin
- JSON.parse(effect["result"]["textValue"]).each do |identifier, value|
- # time critical step
- t = Time.now
- proteomics_features[identifier] ||= klass.find_or_create_by(:name => identifier, :category => "Proteomics")
- t1 += Time.now - t
- t = Time.now
+ JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
+ proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics")
nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
- t2 += Time.now - t
end
-=end
else
feature = klass.find_or_create_by(
:name => effect["endpoint"],
@@ -90,10 +83,6 @@ module OpenTox
end
end
nanoparticle.save
- #p "Total time: #{Time.now - start_time}"
- #p "Proteomics features: #{t1}"
- #p "Proteomics values: #{t2}"
- #p "Time2: #{t2}"
end
datasets.each { |u,d| d.save }
end
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 65aab23..3e29ae1 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -10,6 +10,7 @@ module OpenTox
attr_accessor :scaled_values
def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:
+ p name
dataset = Dataset.find(dataset_id)
relevant_features = {}
measurements = []
@@ -46,6 +47,7 @@ module OpenTox
end
end
end
+ #p relevant_features.keys.collect{|i| Feature.find(i).name}
neighbors = []
substances.each do |substance|
values = dataset.values(substance,prediction_feature_id)
@@ -86,9 +88,12 @@ module OpenTox
physchem_descriptors[feature.id.to_s] << value
physchem_descriptors[feature.id.to_s].uniq!
when "Proteomics"
- proteomics[feature.id.to_s] ||= []
- proteomics[feature.id.to_s] << value
- proteomics[feature.id.to_s].uniq!
+ #proteomics[feature.id.to_s] ||= []
+ #proteomics[feature.id.to_s] << value
+ #proteomics[feature.id.to_s].uniq!
+ physchem_descriptors[feature.id.to_s] ||= []
+ physchem_descriptors[feature.id.to_s] << value
+ physchem_descriptors[feature.id.to_s].uniq!
when "TOX"
if feature.name == "Total protein (BCA assay)"
physchem_descriptors[feature.id.to_s] ||= []
@@ -109,6 +114,7 @@ module OpenTox
def parse_ambit_value feature, v, dataset
#p dataset
#p feature
+ # TODO add study id to warnings
v.delete "unit"
# TODO: ppm instead of weights
if v.keys == ["textValue"]
diff --git a/lib/regression.rb b/lib/regression.rb
index 5028c78..b9067c6 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -88,35 +88,42 @@ module OpenTox
data_frame[j][i] = d[:scaled_value]
end
end if activities
- (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
+ #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
+ (0..pc_ids.size).each do |j| # for R: fill empty values with NA
data_frame[j] ||= []
data_frame[j][i] ||= "NA"
end
end
- remove_idx = []
- data_frame.each_with_index do |r,i|
- remove_idx << i if r.uniq.size == 1 # remove properties with a single value
- end
+ #remove_idx = []
+ #data_frame.each_with_index do |r,i|
+ #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment
+ #end
- remove_idx.reverse.each do |i|
- data_frame.delete_at i
- pc_ids.delete_at i
- end
+ #p data_frame.size
+ #p pc_ids.size
+ #data_frame.delete_if.with_index { |_, index| remove_idx.include? index }
+ #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 }
+ #remove_idx.sort.reverse.each do |i|
+ #p i
+ #data_frame.delete_at i
+ #pc_ids.delete_at i
+ #end
+ #p data_frame.size
+ #p pc_ids.size
if pc_ids.empty?
prediction = local_weighted_average substance, neighbors
prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
prediction
else
- query_descriptors = pc_ids.collect do |i|
- substance.scaled_values[i] ? substance.scaled_values[i] : "NA"
- end
+ query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] }
remove_idx = []
query_descriptors.each_with_index do |v,i|
- remove_idx << i if v == "NA"
+ #remove_idx << i if v == "NA"
+ remove_idx << i unless v
end
- remove_idx.reverse.each do |i|
+ remove_idx.sort.reverse.each do |i|
data_frame.delete_at i
pc_ids.delete_at i
query_descriptors.delete_at i
@@ -135,8 +142,9 @@ module OpenTox
def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
R.assign "weights", training_weights
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
-rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
=begin
+=end
+rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
File.open("tmp.R","w+"){|f|
f.puts "suppressPackageStartupMessages({
library(iterators,lib=\"#{rlib}\")
@@ -159,10 +167,11 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
f.puts "names(fingerprint) <- features"
f.puts "prediction <- predict(model,fingerprint)"
}
-=end
R.eval "data <- #{r_data_frame}"
R.assign "features", training_features
+ p training_features.size
+ p R.eval("names(data)").to_ruby.size
begin
R.eval "names(data) <- append(c('activities'),features)" #
R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index b6a2f00..227f7db 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -9,19 +9,6 @@ class NanoparticleTest < MiniTest::Test
#Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
end
- def test_create_model_with_feature_selection
- skip
- training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
- feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)")
- model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors", :feature_selection_algorithm => "correlation_filter"})
- nanoparticle = training_dataset.nanoparticles[-34]
- #p nanoparticle.neighbors
- prediction = model.predict nanoparticle
- p prediction
- #p prediction
- refute_nil prediction[:value]
- end
-
def test_create_model
skip
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
@@ -34,12 +21,14 @@ class NanoparticleTest < MiniTest::Test
model.delete
end
- # TODO move to validation-statistics
def test_inspect_cv
cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last
+ p cv
+ p cv.id
cv.correlation_plot_id = nil
File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
p cv.statistics
+ #p cv.model.training_dataset.substances.first.physchem_descriptors.keys.collect{|d| Feature.find(d).name}
end
def test_inspect_worst_prediction
@@ -67,26 +56,33 @@ class NanoparticleTest < MiniTest::Test
model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}})
cv = RegressionCrossValidation.create model
- p cv
- #p cv.predictions.sort_by{|sid,p| (p["value"] - p["measurements"].median).abs}
p cv.rmse
p cv.r_squared
#File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
refute_nil cv.r_squared
refute_nil cv.rmse
end
+
def test_validate_pls_model
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
- #feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)")
feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX")
model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}})
cv = RegressionCrossValidation.create model
- p cv
- #p cv.predictions.sort_by{|sid,p| (p["value"] - p["measurements"].median).abs}
p cv.rmse
p cv.r_squared
- File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
+ refute_nil cv.r_squared
+ refute_nil cv.rmse
+ end
+
+ def test_validate_proteomics_pls_model
+ training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+ feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX")
+
+ model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "proteomics_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}})
+ cv = RegressionCrossValidation.create model
+ p cv.rmse
+ p cv.r_squared
refute_nil cv.r_squared
refute_nil cv.rmse
end