summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: helma@in-silico.ch <helma@in-silico.ch> 2018-11-02 20:34:44 +0100
committer: helma@in-silico.ch <helma@in-silico.ch> 2018-11-02 20:34:44 +0100
commit: 3a9c9332b660d35720ad4fa1f55ee0883e53aecd (patch)
tree: 0c4b552ba434c4e03ea6e7808ead5a2d4ecb4cec /lib
parent: 5b08a8c6d8e5567d253bec92d5bf5d18fd040cdc (diff)
warnings fixed, cleanup
Diffstat (limited to 'lib')
-rw-r--r-- lib/caret.rb 11
-rw-r--r-- lib/classification.rb 2
-rw-r--r-- lib/compound.rb 30
-rw-r--r-- lib/crossvalidation.rb 5
-rw-r--r-- lib/dataset.rb 31
-rw-r--r-- lib/import.rb~ 6
-rw-r--r-- lib/leave-one-out-validation.rb 9
-rw-r--r-- lib/model.rb 16
-rw-r--r-- lib/regression.rb 2
-rw-r--r-- lib/substance.rb 1
-rw-r--r-- lib/train-test-validation.rb 11
-rw-r--r-- lib/validation-statistics.rb 1
-rw-r--r-- lib/validation.rb 2
13 files changed, 29 insertions, 98 deletions
diff --git a/lib/caret.rb b/lib/caret.rb
index 8bccf74..2e5f1bc 100644
--- a/lib/caret.rb
+++ b/lib/caret.rb
@@ -22,11 +22,11 @@ module OpenTox
end
if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances."
+ prediction[:warnings] = ["No variables for regression model, using weighted average of similar substances (no prediction interval available)."]
elsif
dependent_variables.size < 3
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
+ prediction[:warnings] = ["Insufficient number of neighbors (#{dependent_variables.size}) for regression model, using weighted average of similar substances (no prediction interval available)."]
else
dependent_variables.each_with_index do |v,i|
dependent_variables[i] = to_r(v)
@@ -51,7 +51,8 @@ module OpenTox
$logger.debug dependent_variables
$logger.debug independent_variables
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances."
+ prediction[:warnings] ||= []
+ prediction[:warnings] << "R caret model creation error, using weighted average of similar substances (no prediction interval available)."
return prediction
end
begin
@@ -72,12 +73,12 @@ module OpenTox
$logger.debug "R caret prediction error for:"
$logger.debug self.inspect
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances"
+ prediction[:warnings] << "R caret prediction error, using weighted average of similar substances (no prediction interval available)."
return prediction
end
if prediction.nil? or prediction[:value].nil?
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances."
+ prediction[:warnings] << "Empty R caret prediction, using weighted average of similar substances (no prediction interval available)."
end
end
prediction
diff --git a/lib/classification.rb b/lib/classification.rb
index e78783b..638492b 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -21,7 +21,7 @@ module OpenTox
probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
p_max = probabilities.collect{|a,p| p}.max
prediction = probabilities.key(p_max)
- {:value => prediction,:probabilities => probabilities,:warnings => []}
+ {:value => prediction,:probabilities => probabilities}
end
end
diff --git a/lib/compound.rb b/lib/compound.rb
index 8b4bb48..6d0e075 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -230,36 +230,6 @@ module OpenTox
update(:cid => RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
self["cid"]
end
-
- def db_neighbors min_sim: 0.2, dataset_id:
- #p fingerprints[DEFAULT_FINGERPRINT]
- # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
-
- #qn = default_fingerprint_size
- #qmin = qn * threshold
- #qmax = qn / threshold
- #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
- #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
- aggregate = [
- #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
- #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
- {'$project' => {
- 'similarity' => {'$let' => {
- 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
- 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
- }},
- '_id' => 1,
- #'measurements' => 1,
- 'dataset_ids' => 1
- }},
- {'$match' => {'similarity' => {'$gte' => min_sim}}},
- {'$sort' => {'similarity' => -1}}
- ]
-
- # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array
- $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id}
-
- end
# Convert mmol to mg
# @return [Float] value in mg
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 4f61ff4..c866ebc 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -24,8 +24,6 @@ module OpenTox
)
cv.save # set created_at
- nr_instances = 0
- nr_unpredicted = 0
training_dataset = model.training_dataset
training_dataset.folds(n).each_with_index do |fold,fold_nr|
#fork do # parallel execution of validations can lead to Rserve and memory problems
@@ -33,12 +31,9 @@ module OpenTox
t = Time.now
validation = TrainTest.create(model, fold[0], fold[1])
cv.validation_ids << validation.id
- cv.nr_instances += validation.nr_instances
- cv.nr_unpredicted += validation.nr_unpredicted
$logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
end
cv.save
- $logger.debug "Nr unpredicted: #{cv.nr_unpredicted}"
cv.statistics
cv.update_attributes(finished_at: Time.now)
cv
diff --git a/lib/dataset.rb b/lib/dataset.rb
index d02a302..42733e4 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -310,10 +310,6 @@ module OpenTox
end
all_substances << substance
- substance.dataset_ids << self.id
- substance.dataset_ids.uniq!
- substance.save
-
add substance, original_id, original_id_value
vals.each_with_index do |v,j|
@@ -422,6 +418,7 @@ module OpenTox
# @param [Integer] number of folds
# @return [Array] Array with folds [training_dataset,test_dataset]
def folds n
+ $logger.debug "Creating #{n} folds for #{name}."
len = self.substances.size
indices = (0..len-1).to_a.shuffle
mid = (len/n)
@@ -431,19 +428,15 @@ module OpenTox
last = start+mid
last = last-1 unless len%n >= i
test_idxs = indices[start..last] || []
- test_substances = test_idxs.collect{|i| substances[i]}
+ test_substances = test_idxs.collect{|i| substances[i].id}
training_idxs = indices-test_idxs
- training_substances = training_idxs.collect{|i| substances[i]}
+ training_substances = training_idxs.collect{|i| substances[i].id}
chunk = [training_substances,test_substances].collect do |substances|
- dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id )
- substances.each do |substance|
- substance.dataset_ids << dataset.id
- substance.dataset_ids.uniq!
- substance.save
- dataset.data_entries += data_entries.select{|row| row[0] == substance.id}
- end
- dataset.save
- dataset
+ self.class.create(
+ :name => "#{self.name} (Fold #{i-1})",
+ :source => self.id,
+ :data_entries => data_entries.select{|row| substances.include? row[0]}
+ )
end
start = last+1
chunks << chunk
@@ -468,7 +461,7 @@ module OpenTox
if features.first.kind_of? NominalFeature
merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(", ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps)
else
- merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO, :transformations
+ merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations
end
else
bad_request_error "Cannot merge features of different types (#{feature_classes})."
@@ -521,12 +514,6 @@ module OpenTox
def transform # TODO
end
- # Delete dataset
- def delete
- compounds.each{|c| c.dataset_ids.delete id.to_s}
- super
- end
-
end
end
diff --git a/lib/import.rb~ b/lib/import.rb~
index 0857717..cf1a26f 100644
--- a/lib/import.rb~
+++ b/lib/import.rb~
@@ -47,9 +47,9 @@ module OpenTox
:core_id => core_id,
:coating_ids => coating_ids
)
- np["bundles"].keys.each do |bundle_uri|
- nanoparticle.dataset_ids << datasets[bundle_uri].id
- end
+ #np["bundles"].keys.each do |bundle_uri|
+ #nanoparticle.dataset_ids << datasets[bundle_uri].id
+ #end
studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"]
studies.each do |study|
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index b0905b8..d37b6ce 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -16,18 +16,11 @@ module OpenTox
loo = klass.new :model_id => model.id
predictions = model.predict model.training_dataset.substances
predictions.each{|cid,p| p.delete(:neighbors)}
- nr_unpredicted = 0
predictions.each do |cid,prediction|
- if prediction[:value]
- prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id])
- else
- nr_unpredicted += 1
- end
+ prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) if prediction[:value]
predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
end
predictions.select!{|cid,p| p[:value] and p[:measurements]}
- loo.nr_instances = predictions.size
- loo.nr_unpredicted = nr_unpredicted
loo.predictions = predictions
loo.statistics
$logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
diff --git a/lib/model.rb b/lib/model.rb
index f50fcd7..a0c60f0 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -38,7 +38,6 @@ module OpenTox
def self.create prediction_feature:nil, training_dataset:, algorithms:{}
bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature
- # TODO: prediction_feature without training_dataset: use all available data
# guess model type
prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new
@@ -198,9 +197,8 @@ module OpenTox
# Predict a substance (compound or nanoparticle)
# @param [OpenTox::Substance]
# @return [Hash]
- def predict_substance substance, threshold = self.algorithms[:similarity][:min]
+ def predict_substance substance, threshold = self.algorithms[:similarity][:min], prediction = nil
- t = Time.now
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
case algorithms[:similarity][:method]
when /tanimoto/ # binary features
@@ -229,7 +227,7 @@ module OpenTox
bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
end
- prediction = {:warnings => [], :measurements => []}
+ prediction ||= {:warnings => [], :measurements => []}
prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
neighbor_ids = []
neighbor_similarities = []
@@ -240,7 +238,7 @@ module OpenTox
substance_ids.each_with_index do |s,i|
# handle query substance
if substance.id.to_s == s
- prediction[:measurements] << dependent_variables[i]
+ prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min] # add measurements only once at first pass
prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
else
if fingerprints?
@@ -277,17 +275,13 @@ module OpenTox
result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
prediction.merge! result
prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
- #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
- #prediction[:warnings] << "Closest neighbor has similarity #{neighbor_similarities.max} < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
- #end
end
if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
prediction
else # try again with a lower threshold
- predict_substance substance, 0.2
+ prediction[:warnings] << "Lowering similarity threshold to 0.2."
+ predict_substance substance, 0.2, prediction
end
- #p Time.now - t
- prediction
end
# Predict a substance (compound or nanoparticle), an array of substances or a dataset
diff --git a/lib/regression.rb b/lib/regression.rb
index 25c0732..fd2855f 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -17,7 +17,7 @@ module OpenTox
sim_sum += weights[i]
end if dependent_variables
sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
- {:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]}
+ {:value => prediction}
end
end
diff --git a/lib/substance.rb b/lib/substance.rb
index ef49659..5c486d8 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -3,7 +3,6 @@ module OpenTox
# Base class for substances (e.g. compunds, nanoparticles)
class Substance
field :properties, type: Hash, default: {}
- field :dataset_ids, type: Array, default: []
end
end
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
index bffee8c..d034cd1 100644
--- a/lib/train-test-validation.rb
+++ b/lib/train-test-validation.rb
@@ -18,22 +18,15 @@ module OpenTox
validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
validation_model.save
predictions = validation_model.predict test_set.substances
- nr_unpredicted = 0
predictions.each do |cid,prediction|
- if prediction[:value]
- prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id])
- else
- nr_unpredicted += 1
- end
+ prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) if prediction[:value]
end
predictions.select!{|cid,p| p[:value] and p[:measurements]}
- # hack to avoid mongos file size limit error on large datasets
+ # remove neighbors to avoid mongos file size limit error on large datasets
predictions.each{|cid,p| p.delete(:neighbors)} #if model.training_dataset.name.match(/mutagenicity/i)
validation = self.new(
:model_id => validation_model.id,
:test_dataset_id => test_set.id,
- :nr_instances => test_set.substances.size,
- :nr_unpredicted => nr_unpredicted,
:predictions => predictions
)
validation.save
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 7bae891..ad4c14d 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -82,6 +82,7 @@ module OpenTox
self.weighted_accuracy[a] = (weighted_confusion_matrix[a][0][0]+weighted_confusion_matrix[a][1][1])/confidence_sum[a].to_f
end
$logger.debug "Accuracy #{accuracy}"
+ $logger.debug "Nr Predictions #{nr_predictions}"
save
{
:accept_values => accept_values,
diff --git a/lib/validation.rb b/lib/validation.rb
index c9954b6..9402361 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -10,8 +10,6 @@ module OpenTox
store_in collection: "validations"
field :name, type: String
field :model_id, type: BSON::ObjectId
- field :nr_instances, type: Integer, default: 0
- field :nr_unpredicted, type: Integer, default: 0
field :predictions, type: Hash, default: {}
field :finished_at, type: Time