summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2016-10-07 10:25:58 +0200
committerChristoph Helma <helma@in-silico.ch>2016-10-07 10:25:58 +0200
commit91787edb3682900bc5a2feeca66e5142f387fcc6 (patch)
treefcb189bb8e84c9b3727b29f0846a5f820b596aee
parent4348eec89033e6677c9f628646fc67bd03c73fe6 (diff)
unified interface for prediction algorithms
-rw-r--r--lib/caret.rb152
-rw-r--r--lib/classification.rb2
-rw-r--r--lib/crossvalidation.rb4
-rw-r--r--lib/dataset.rb2
-rw-r--r--lib/feature.rb18
-rw-r--r--lib/import.rb3
-rw-r--r--lib/nanoparticle.rb50
-rw-r--r--lib/physchem.rb6
-rw-r--r--lib/regression.rb2
-rw-r--r--test/nanoparticles.rb129
10 files changed, 186 insertions, 182 deletions
diff --git a/lib/caret.rb b/lib/caret.rb
new file mode 100644
index 0000000..b999b06
--- /dev/null
+++ b/lib/caret.rb
@@ -0,0 +1,152 @@
+module OpenTox
+ module Algorithm
+
+ class Caret
+ # TODO classification
+ # model list: https://topepo.github.io/caret/modelList.html
+
+ attr_accessor :descriptors, :neighbors, :method, :relevant_features, :data_frame, :feature_names, :weights, :query_features
+
+ def initialize descriptors:, neighbors:, method:, relevant_features:
+ @descriptors = descriptors
+ @neighbors = neighbors
+ @method = method
+ @relevant_features = relevant_features
+ end
+
+ def self.regression descriptors:, neighbors:, method:, relevant_features:nil
+
+ caret = new(descriptors:descriptors, neighbors:neighbors, method:method, relevant_features:relevant_features)
+ # collect training data for R
+ if descriptors.is_a? Array
+ caret.fingerprint2R
+ elsif descriptors.is_a? Hash
+ caret.properties2R
+ else
+ bad_request_error "Descriptors should be a fingerprint (Array) or properties (Hash). Cannot handle '#{descriptors.class}'."
+ end
+ if caret.feature_names.empty? or caret.data_frame.flatten.uniq == ["NA"]
+        prediction = Algorithm::Regression::weighted_average(descriptors: descriptors, neighbors: neighbors)
+ prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+ else
+ prediction = caret.r_model_prediction
+ if prediction.nil? or prediction[:value].nil?
+          prediction = Algorithm::Regression::weighted_average(descriptors: descriptors, neighbors: neighbors)
+ prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
+ end
+ end
+ prediction
+
+ end
+
+ def fingerprint2R
+
+ values = []
+ features = {}
+ @weights = []
+ descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort
+
+ neighbors.each do |n|
+ activities = n["measurements"]
+ activities.each do |act|
+ values << act
+ @weights << n["similarity"]
+ descriptor_ids.each do |id|
+ features[id] ||= []
+ features[id] << n["descriptors"].include?(id)
+ end
+ end if activities
+ end
+
+ @feature_names = []
+ @data_frame = [values]
+
+ features.each do |k,v|
+ unless v.uniq.size == 1
+ @data_frame << v.collect{|m| m ? "T" : "F"}
+ @feature_names << k
+ end
+ end
+ @query_features = @feature_names.collect{|f| descriptors.include?(f) ? "T" : "F"}
+
+ end
+
+
+ def properties2R
+
+ @weights = []
+ @feature_names = []
+ @query_features = []
+
+ # keep only descriptors with values
+ @relevant_features.keys.each_with_index do |f,i|
+ if @descriptors[f]
+ @feature_names << f
+ @query_features << @descriptors[f].median
+ else
+ neighbors.each do |n|
+ n["descriptors"].delete_at i
+ end
+ end
+ end
+
+ measurements = neighbors.collect{|n| n["measurements"]}.flatten
+ # initialize data frame with 'NA' defaults
+ @data_frame = Array.new(@feature_names.size+1){Array.new(measurements.size,"NA") }
+
+ i = 0
+ # parse neighbor activities and descriptors
+ neighbors.each do |n|
+ activities = n["measurements"]
+ activities.each do |act| # multiple measurements are treated as separate instances
+ unless n["descriptors"].include?(nil)
+ data_frame[0][i] = act
+ @weights << n["similarity"]
+ n["descriptors"].each_with_index do |d,j|
+ @data_frame[j+1][i] = d
+ end
+ i += 1
+ end
+ end if activities # ignore neighbors without measurements
+ end
+
+ end
+
+ def r_model_prediction
+ begin
+ R.assign "weights", @weights
+ r_data_frame = "data.frame(#{@data_frame.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+ R.eval "data <- #{r_data_frame}"
+ R.assign "features", @feature_names
+ R.eval "names(data) <- append(c('activities'),features)" #
+ R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
+ rescue => e
+ $logger.debug "R caret model creation error for:"
+ $logger.debug JSON.pretty_generate(self.inspect)
+ return nil
+ end
+ begin
+ R.eval "query <- data.frame(rbind(c(#{@query_features.join ','})))"
+ R.eval "names(query) <- features"
+ R.eval "prediction <- predict(model,query)"
+ value = R.eval("prediction").to_f
+ rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
+ r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
+ prediction_interval = value-1.96*rmse, value+1.96*rmse
+ {
+ :value => value,
+ :rmse => rmse,
+ :r_squared => r_squared,
+ :prediction_interval => prediction_interval
+ }
+ rescue => e
+ $logger.debug "R caret prediction error for:"
+ $logger.debug self.inspect
+ return nil
+ end
+ end
+
+ end
+ end
+end
+
diff --git a/lib/classification.rb b/lib/classification.rb
index 01ba878..6582e7d 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -3,7 +3,7 @@ module OpenTox
class Classification
- def self.weighted_majority_vote descriptors:nil, neighbors:
+ def self.weighted_majority_vote descriptors:nil, neighbors:, method:nil, relevant_features:nil
sims = {}
neighbors.each do |neighbor|
sim = neighbor["similarity"]
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index d7a1f08..15d1031 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -16,10 +16,10 @@ module OpenTox
folds: n
)
cv.save # set created_at
+
nr_instances = 0
nr_unpredicted = 0
- #predictions = {}
- training_dataset = Dataset.find model.training_dataset_id
+ training_dataset = model.training_dataset
training_dataset.folds(n).each_with_index do |fold,fold_nr|
#fork do # parallel execution of validations can lead to Rserve and memory problems
$logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 2e21e5b..453fc35 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -165,11 +165,9 @@ module OpenTox
feature = nil
if values.size == 0 # empty feature
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
- metadata["numeric"] = true
numeric[i] = true
feature = NumericFeature.find_or_create_by(metadata)
else
- metadata["nominal"] = true
metadata["accept_values"] = values
numeric[i] = false
feature = NominalFeature.find_or_create_by(metadata)
diff --git a/lib/feature.rb b/lib/feature.rb
index c6fb68a..0ca4d41 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -2,30 +2,28 @@ module OpenTox
# Basic feature class
class Feature
- field :nominal, type: Boolean
- field :numeric, type: Boolean
field :measured, type: Boolean
field :calculated, type: Boolean
field :category, type: String
field :unit, type: String
field :conditions, type: Hash
+
+ def nominal?
+ self.class == NominalFeature
+ end
+
+ def numeric?
+ self.class == NumericFeature
+ end
end
# Feature for categorical variables
class NominalFeature < Feature
field :accept_values, type: Array
- def initialize params
- super params
- nominal = true
- end
end
# Feature for quantitative variables
class NumericFeature < Feature
- def initialize params
- super params
- numeric = true
- end
end
# Feature for SMARTS fragments
diff --git a/lib/import.rb b/lib/import.rb
index 17894a9..8e57401 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -76,7 +76,7 @@ module OpenTox
if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
- proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source)
+ proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
end
else
@@ -98,6 +98,7 @@ module OpenTox
:category => category,
:conditions => effect["conditions"],
:source => study["protocol"]["category"]["term"],
+ :measured => true,
:warnings => warnings
)
nanoparticle.parse_ambit_value feature, effect["result"], dataset
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 6905f6f..f74f263 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -8,54 +8,6 @@ module OpenTox
attr_accessor :scaled_values
-=begin
- def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:, relevant_features:
- dataset = Dataset.find(dataset_id)
- #relevant_features = {}
- measurements = []
- substances = []
- # TODO: exclude query activities!!!
- dataset.substances.each do |s|
- if s.core == self.core # exclude nanoparticles with different core
- dataset.values(s,prediction_feature_id).each do |act|
- measurements << act
- substances << s
- end
- end
- end
- neighbors = []
- substances.each do |substance|
- values = dataset.values(substance,prediction_feature_id)
- if values
- common_descriptors = relevant_features.keys & substance.descriptors.keys
- # scale values
- query_descriptors = common_descriptors.collect{|d| (descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]}
- @scaled_values = common_descriptors.collect{|d| [d,(descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h
- neighbor_descriptors = common_descriptors.collect{|d| (substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]}
- neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h
- #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]}
- weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2}
- sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights)
- neighbors << {
- "_id" => substance.id,
- "measurements" => values,
- "similarity" => sim,
- "common_descriptors" => common_descriptors.collect do |id|
- {
- :id => id,
- :scaled_value => neighbor_scaled_values[id],
- :p_value => relevant_features[id]["p_value"],
- :r_squared => relevant_features[id]["r"]**2}
- end
- } if sim >= min_sim
- end
- end
- $logger.debug "#{self.name}: #{neighbors.size} neighbors"
- neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]}
- neighbors
- end
-=end
-
def add_feature feature, value, dataset
unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand
case feature.category
@@ -78,8 +30,6 @@ module OpenTox
end
def parse_ambit_value feature, v, dataset
- #p dataset
- #p feature
# TODO add study id to warnings
v.delete "unit"
# TODO: ppm instead of weights
diff --git a/lib/physchem.rb b/lib/physchem.rb
index 86300ba..c32e382 100644
--- a/lib/physchem.rb
+++ b/lib/physchem.rb
@@ -42,7 +42,7 @@ module OpenTox
def self.descriptors desc=DESCRIPTORS
desc.collect do |name,description|
lib,desc = name.split('.',2)
- self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
+ self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
end
end
@@ -54,11 +54,11 @@ module OpenTox
CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n|
dname = "#{name}.#{n}"
description = DESCRIPTORS[dname]
- udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
+ udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
end
else
description = DESCRIPTORS[name]
- udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
+ udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
end
end
udesc
diff --git a/lib/regression.rb b/lib/regression.rb
index cf6d9cb..0e5e06b 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,7 +3,7 @@ module OpenTox
class Regression
- def self.weighted_average descriptors:nil, neighbors:, parameters:nil
+ def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil
# TODO: prediction_interval
weighted_sum = 0.0
sim_sum = 0.0
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 9b2d2d9..074a429 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -14,57 +14,18 @@ class NanoparticleTest < MiniTest::Test
end
def test_create_model
- model = Model::Lazar.create training_dataset: @training_dataset
+ model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
nanoparticle = @training_dataset.nanoparticles[-34]
prediction = model.predict nanoparticle
- p prediction
refute_nil prediction[:value]
assert_includes nanoparticle.dataset_ids, @training_dataset.id
+    assert @prediction_feature.measured
model.delete
end
- def test_inspect_cv
- skip
- cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last
- #p cv
- #p cv.id
- #cv.correlation_plot_id = nil
- File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
- #p cv.statistics
- #p cv.model.@training_dataset.substances.first.physchem_descriptors.keys.collect{|d| Feature.find(d).name}
- CrossValidation.all.sort_by{|cv| cv.created_at}.reverse.each do |cv|
- p cv.name
- p cv.created_at
- begin
- p cv.r_squared
- rescue
- end
- end
- end
- def test_inspect_worst_prediction
- skip
-
- cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last
- worst_predictions = cv.worst_predictions(n: 3,show_neigbors: false)
- assert_equal 3, worst_predictions.size
- assert_kind_of Integer, worst_predictions.first[:neighbors]
- worst_predictions = cv.worst_predictions
- assert_equal 5, worst_predictions.size
- assert_kind_of Array, worst_predictions.first[:neighbors]
- assert_kind_of Integer, worst_predictions.first[:neighbors].first[:common_descriptors]
- puts worst_predictions.to_yaml
- worst_predictions = cv.worst_predictions(n: 2, show_common_descriptors: true)
- #puts worst_predictions.to_yaml
- assert_equal 2, worst_predictions.size
- assert_kind_of Array, worst_predictions.first[:neighbors]
- refute_nil worst_predictions.first[:neighbors].first[:common_descriptors]
- #p cv.model.training_dataset.features
- end
-
- def test_validate_model
- algorithms = { :prediction => {:method => "Algorithm::Regression.weighted_average" } }
- model = Model::Lazar.create training_dataset: @training_dataset
- cv = RegressionCrossValidation.create model
+ def test_validate_default_nanoparticle_model
+ model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
+ cv = CrossValidation.create model
p cv.rmse
p cv.r_squared
#File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
@@ -72,62 +33,42 @@ class NanoparticleTest < MiniTest::Test
refute_nil cv.rmse
end
- def test_validate_pls_model
+ def test_validate_pls_nanoparticle_model
algorithms = {
- :descriptors => {
- :method => "properties",
- :types => ["P-CHEM"]
- },
- :prediction => {:method => "Algorithm::Caret.regression", :parameters => 'pls' },
+ :descriptors => { :types => ["P-CHEM"] },
+ :prediction => {:parameters => 'pls' },
}
model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
- cv = RegressionCrossValidation.create model
+ assert_equal "pls", model.algorithms[:prediction][:method]
+ cv = CrossValidation.create model
p cv.rmse
p cv.r_squared
refute_nil cv.r_squared
refute_nil cv.rmse
end
- def test_validate_random_forest_model
+ def test_validate_proteomics_pls_nanoparticle_model
algorithms = {
- :descriptors => {
- :method => "properties",
- :types => ["P-CHEM"]
- },
- :prediction => {:method => "Algorithm::Caret.regression", :parameters => 'rf' }
+ :descriptors => { :types => ["Proteomics"] },
+ :prediction => { :parameters => 'pls' }
}
model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
- cv = RegressionCrossValidation.create model
+ assert_equal "pls", model.algorithms[:prediction][:method]
+ cv = CrossValidation.create model
p cv.rmse
p cv.r_squared
refute_nil cv.r_squared
refute_nil cv.rmse
end
- def test_validate_proteomics_pls_model
- algorithms = {
- :descriptors => {
- :method => "properties",
- :types => ["Proteomics"]
- },
- :prediction => {:method => "Algorithm::Caret.regression", :parameters => 'rf' }
- }
- model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
- cv = RegressionCrossValidation.create model
- p cv.rmse
- p cv.r_squared
- refute_nil cv.r_squared
- refute_nil cv.rmse
- end
-
- def test_validate_all_default_model
+ def test_validate_all_default_nanoparticle_model
algorithms = {
:descriptors => {
:types => ["Proteomics","P-CHEM"]
},
}
model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
- cv = RegressionCrossValidation.create model
+ cv = CrossValidation.create model
p cv.rmse
p cv.r_squared
refute_nil cv.r_squared
@@ -141,42 +82,6 @@ class NanoparticleTest < MiniTest::Test
end
end
- def test_summaries
- skip
- datasets = Dataset.all
- datasets = datasets.select{|d| !d.name.nil?}
- datasets.each do |d|
-
- #p d.features.select{|f| f.name.match (/Total/)}
- #p d.features.collect{|f| "#{f.name} #{f.unit} #{f.conditions}"}
- p d.features.uniq.collect{|f| f.name}
- end
- assert_equal 9, datasets.size
-=begin
- features = Feature.all.to_a
- #p features.collect do |f|
- #f if f.category == "TOX"
- #end.to_a.flatten.size
- toxcounts = {}
- pccounts = {}
- Nanoparticle.all.each do |np|
- np.measurements.each do |t,v|
- toxcounts[t] ||= 0
- toxcounts[t] += 1#v.uniq.size
- end
- np.physchem_descriptors.each do |t,v|
- pccounts[t] ||= 0
- pccounts[t] += 1#v.uniq.size
- end
- end
- #puts counts.keys.collect{|i| Feature.find(i)}.to_yaml
- #pccounts.each{|e,n| p Feature.find(e),n if n > 100}
- #p toxcounts.collect{|e,n| Feature.find(e).name if n > 1}.uniq
- toxcounts.each{|e,n| p Feature.find(e),n if n > 100}
-=end
- end
-
-
def test_import_ld
skip
dataset_ids = Import::Enanomapper.import_ld