Diffstat (limited to 'lib')
-rw-r--r--  lib/algorithm.rb                  13
-rw-r--r--  lib/caret.rb                      96
-rw-r--r--  lib/classification.rb             36
-rw-r--r--  lib/compound.rb                  104
-rw-r--r--  lib/crossvalidation.rb           388
-rw-r--r--  lib/dataset.rb                   279
-rw-r--r--  lib/feature.rb                    29
-rw-r--r--  lib/feature_selection.rb          42
-rw-r--r--  lib/import.rb                    122
-rw-r--r--  lib/lazar.rb                      18
-rw-r--r--  lib/leave-one-out-validation.rb  238
-rw-r--r--  lib/model.rb                     449
-rw-r--r--  lib/nanoparticle.rb               92
-rw-r--r--  lib/opentox.rb                    11
-rw-r--r--  lib/overwrite.rb                  27
-rw-r--r--  lib/physchem.rb                   21
-rw-r--r--  lib/regression.rb                144
-rw-r--r--  lib/rest-client-wrapper.rb         6
-rw-r--r--  lib/similarity.rb                 65
-rw-r--r--  lib/substance.rb                   8
-rw-r--r--  lib/train-test-validation.rb      69
-rw-r--r--  lib/validation-statistics.rb     225
-rw-r--r--  lib/validation.rb                117
23 files changed, 1428 insertions, 1171 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 113f847..0e4b93a 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -2,18 +2,9 @@ module OpenTox
module Algorithm
- # Generic method to execute algorithms
- # Algorithms should:
- # - accept a Compound, an Array of Compounds or a Dataset as first argument
- # - optional parameters as second argument
- # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values
- # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object
- # @param [Hash] Algorithm parameters
- # @return Algorithm result
- def self.run algorithm, object, parameters=nil
- bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
+ def self.run algorithm, parameters=nil
klass,method = algorithm.split('.')
- parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
+ Object.const_get(klass).send(method,parameters)
end
end
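
The refactored Algorithm.run resolves a "Namespace::Klass.method" string via reflection and forwards a single parameters argument. A minimal, self-contained sketch of that dispatch (the Example class below is hypothetical, not part of lazar):

    module OpenTox
      module Algorithm
        class Example
          def self.echo params
            params
          end
        end
      end
    end

    algorithm = "OpenTox::Algorithm::Example.echo"
    # "::" contains no ".", so split('.') separates only the method name
    klass, method = algorithm.split('.')
    result = Object.const_get(klass).send(method, {:value => 42})
    puts result.inspect   # => {:value=>42}
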
diff --git a/lib/caret.rb b/lib/caret.rb
new file mode 100644
index 0000000..7e4f771
--- /dev/null
+++ b/lib/caret.rb
@@ -0,0 +1,96 @@
+module OpenTox
+ module Algorithm
+
+ class Caret
+ # model list: https://topepo.github.io/caret/modelList.html
+
+ def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables:
+ remove = []
+ # remove independent_variables with single values
+ independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1}
+ remove.sort.reverse.each do |i|
+ independent_variables.delete_at i
+ query_variables.delete_at i
+ end
+ if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
+ prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+ prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+      elsif dependent_variables.size < 3
+        prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+        prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
+ else
+ dependent_variables.each_with_index do |v,i|
+ dependent_variables[i] = to_r(v)
+ end
+ independent_variables.each_with_index do |c,i|
+ c.each_with_index do |v,j|
+ independent_variables[i][j] = to_r(v)
+ end
+ end
+ query_variables.each_with_index do |v,i|
+ query_variables[i] = to_r(v)
+ end
+ begin
+ R.assign "weights", weights
+ r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+ R.eval "data <- #{r_data_frame}"
+ R.assign "features", (0..independent_variables.size-1).to_a
+ R.eval "names(data) <- append(c('activities'),features)" #
+ R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
+ rescue => e
+ $logger.debug "R caret model creation error for:"
+ $logger.debug dependent_variables
+ $logger.debug independent_variables
+ prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+ prediction[:warning] = "R caret model creation error. Using weighted average of similar substances."
+ return prediction
+ end
+ begin
+ R.eval "query <- data.frame(rbind(c(#{query_variables.join ','})))"
+ R.eval "names(query) <- features"
+ R.eval "prediction <- predict(model,query)"
+ value = R.eval("prediction").to_f
+ rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
+ r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
+ prediction_interval = value-1.96*rmse, value+1.96*rmse
+ prediction = {
+ :value => value,
+ :rmse => rmse,
+ :r_squared => r_squared,
+ :prediction_interval => prediction_interval
+ }
+ rescue => e
+ $logger.debug "R caret prediction error for:"
+ $logger.debug self.inspect
+ prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+        prediction[:warning] = "R caret prediction error. Using weighted average of similar substances."
+ return prediction
+ end
+ if prediction.nil? or prediction[:value].nil?
+ prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
+ prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
+ end
+ end
+ prediction
+
+ end
+
+ # call caret methods dynamically, e.g. Caret.pls
+ def self.method_missing(sym, *args, &block)
+ args.first[:method] = sym.to_s
+ self.create_model_and_predict args.first
+ end
+
+ def self.to_r v
+ return "F" if v == false
+ return "T" if v == true
+ return nil if v.is_a? Float and v.nan?
+ v
+ end
+
+ end
+ end
+end
+
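
The prediction interval above is derived from the training RMSE under a normality assumption for the residuals: 1.96 is approximately the 97.5th percentile of the standard normal distribution, so value ± 1.96·RMSE covers roughly 95% of expected errors. A worked example with illustrative numbers (not taken from a real model):

    value = -2.5   # predicted -log10 activity
    rmse  = 0.4    # training RMSE, as reported by caret's getTrainPerf
    prediction_interval = value - 1.96*rmse, value + 1.96*rmse
    p prediction_interval   # => [-3.284, -1.716], the approximate 95% interval
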
diff --git a/lib/classification.rb b/lib/classification.rb
index 0202940..e8c179f 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -3,32 +3,24 @@ module OpenTox
class Classification
- def self.weighted_majority_vote compound, params
- neighbors = params[:neighbors]
- weighted_sum = {}
- sim_sum = 0.0
- confidence = 0.0
- neighbors.each do |row|
- sim = row["tanimoto"]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- weighted_sum[act] ||= 0
- weighted_sum[act] += sim
- end
+ def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:
+ class_weights = {}
+ dependent_variables.each_with_index do |v,i|
+ class_weights[v] ||= []
+ class_weights[v] << weights[i] unless v.nil?
end
- case weighted_sum.size
- when 1
- return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs}
- when 2
- sim_sum = weighted_sum[weighted_sum.keys[0]]
- sim_sum -= weighted_sum[weighted_sum.keys[1]]
- sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
- confidence = (sim_sum/neighbors.size).abs
- return {:value => prediction,:confidence => confidence}
- else
- bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
+ probabilities = {}
+ class_weights.each do |a,w|
+ probabilities[a] = w.sum/weights.sum
end
+ probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
+ p_max = probabilities.collect{|a,p| p}.max
+ prediction = probabilities.key(p_max)
+ {:value => prediction,:probabilities => probabilities}
end
+
end
+
end
end
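
A toy run of the new weighted majority vote with three neighbors (numbers are illustrative): similarity weights are summed per class, normalised by the total weight, and finally rescaled by the largest weight.

    dependent_variables = ["active", "inactive", "active"]
    weights             = [0.9, 0.5, 0.6]

    class_weights = {}
    dependent_variables.each_with_index do |v,i|
      (class_weights[v] ||= []) << weights[i] unless v.nil?
    end
    probabilities = {}
    class_weights.each { |a,w| probabilities[a] = w.sum/weights.sum }
    # {"active"=>0.75, "inactive"=>0.25}
    probabilities = probabilities.collect{|a,p| [a, weights.max*p]}.to_h
    # {"active"=>0.675, "inactive"=>0.225}
    p probabilities.key(probabilities.values.max)   # => "active"
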
diff --git a/lib/compound.rb b/lib/compound.rb
index e002305..8a1143b 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -1,11 +1,9 @@
-CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
+CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/"
module OpenTox
- class Compound
+ class Compound < Substance
require_relative "unique_descriptors.rb"
- include OpenTox
-
DEFAULT_FINGERPRINT = "MP2D"
field :inchi, type: String
@@ -19,9 +17,6 @@ module OpenTox
field :sdf_id, type: BSON::ObjectId
field :fingerprints, type: Hash, default: {}
field :default_fingerprint_size, type: Integer
- field :physchem_descriptors, type: Hash, default: {}
- field :dataset_ids, type: Array, default: []
- field :features, type: Hash, default: {}
index({smiles: 1}, {unique: true})
@@ -80,9 +75,8 @@ module OpenTox
fingerprints[type]
end
- def physchem descriptors=PhysChem.openbabel_descriptors
- # TODO: speedup java descriptors
- calculated_ids = physchem_descriptors.keys
+ def calculate_properties descriptors=PhysChem::OPENBABEL
+ calculated_ids = properties.keys
# BSON::ObjectId instances are not allowed as keys in a BSON document.
new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
descs = {}
@@ -95,11 +89,11 @@ module OpenTox
# avoid recalculating Cdk features with multiple values
descs.keys.uniq.each do |k|
descs[k].send(k[0].downcase,k[1],self).each do |n,v|
- physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
+ properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
end
end
save
- physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
+ descriptors.collect{|d| properties[d.id.to_s]}
end
def smarts_match smarts, count=false
@@ -142,9 +136,6 @@ module OpenTox
# @param inchi [String] smiles InChI string
# @return [OpenTox::Compound] Compound
def self.from_inchi inchi
- # Temporary workaround for OpenBabels Inchi bug
- # http://sourceforge.net/p/openbabel/bugs/957/
- # bug has not been fixed in latest git/development version
#smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
smiles = obconversion(inchi,"inchi","can")
if smiles.empty?
@@ -246,7 +237,7 @@ module OpenTox
# @return [String] PubChem Compound Identifier (CID), derived via a REST call to PubChem
def cid
- pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/"
+ pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
self["cid"]
end
@@ -254,70 +245,13 @@ module OpenTox
# @return [String] ChEMBL database compound id, derived via a REST call to ChEMBL
def chemblid
# https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
- uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
+ uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"]
self["chemblid"]
end
- def fingerprint_count_neighbors params
- # TODO fix
- neighbors = []
- query_fingerprint = self.fingerprint params[:type]
- training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
- unless self == compound
- candidate_fingerprint = compound.fingerprint params[:type]
- features = (query_fingerprint + candidate_fingerprint).uniq
- min_sum = 0
- max_sum = 0
- features.each do |f|
- min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
- min_sum += min
- max_sum += max
- end
- max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
- neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
- end
- end
- neighbors.sort{|a,b| b.last <=> a.last}
- end
-
- def fingerprint_neighbors params
- bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
- neighbors = []
- if params[:type] == DEFAULT_FINGERPRINT
- neighbors = db_neighbors params
- else
- query_fingerprint = self.fingerprint params[:type]
- training_dataset = Dataset.find(params[:training_dataset_id])
- prediction_feature = training_dataset.features.first
- training_dataset.compounds.each do |compound|
- candidate_fingerprint = compound.fingerprint params[:type]
- sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
- feature_values = training_dataset.values(compound,prediction_feature)
- neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
- end
- neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
- end
- neighbors
- end
-
- def physchem_neighbors params
- feature_dataset = Dataset.find params[:feature_dataset_id]
- query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
- neighbors = []
- feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
- # TODO implement pearson and cosine similarity separatly
- R.assign "x", query_fingerprint
- R.assign "y", candidate_fingerprint
- sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
- if sim >= params[:min_sim]
- neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
- end
- end
- neighbors
- end
-
- def db_neighbors params
+ def db_neighbors min_sim: 0.1, dataset_id:
+ #p fingerprints[DEFAULT_FINGERPRINT]
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
#qn = default_fingerprint_size
@@ -329,20 +263,20 @@ module OpenTox
#{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
#{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
{'$project' => {
- 'tanimoto' => {'$let' => {
+ 'similarity' => {'$let' => {
'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
- #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
}},
'_id' => 1,
- 'features' => 1,
+ #'measurements' => 1,
'dataset_ids' => 1
}},
- {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
- {'$sort' => {'tanimoto' => -1}}
+ {'$match' => {'similarity' => {'$gte' => min_sim}}},
+ {'$sort' => {'similarity' => -1}}
]
-
- $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
+
+ # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array
+ $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id}
end
@@ -353,7 +287,7 @@ module OpenTox
end
# Convert mg to mmol
- # @return [Float] value in mg
+ # @return [Float] value in mmol
def mg_to_mmol mg
mg.to_f/molecular_weight
end
@@ -362,7 +296,7 @@ module OpenTox
# @return [Float] molecular weight
def molecular_weight
mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
- physchem([mw_feature])[mw_feature.id.to_s]
+ calculate_properties([mw_feature]).first
end
private
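
The db_neighbors aggregation above computes Tanimoto similarity inside MongoDB: the size of the fingerprint intersection divided by the size of the union, expressed as common/(a+b-common). A plain-Ruby equivalent for reference (bit sets are illustrative):

    def tanimoto bits_a, bits_b
      common = (bits_a & bits_b).size
      common.to_f/(bits_a.size + bits_b.size - common)
    end

    puts tanimoto([1,2,3,4], [3,4,5])   # => 0.4 (2 common bits, 5 in the union)
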
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 15dfb21..5a05955 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -1,301 +1,135 @@
module OpenTox
- class CrossValidation
- field :validation_ids, type: Array, default: []
- field :model_id, type: BSON::ObjectId
- field :folds, type: Integer
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :predictions, type: Array, default: []
- field :finished_at, type: Time
-
- def time
- finished_at - created_at
- end
-
- def validations
- validation_ids.collect{|vid| Validation.find vid}
- end
-
- def model
- Model::Lazar.find model_id
- end
-
- def self.create model, n=10
- model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
- bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
- cv = klass.new(
- name: model.name,
- model_id: model.id,
- folds: n
- )
- cv.save # set created_at
- nr_instances = 0
- nr_unpredicted = 0
- predictions = []
- training_dataset = Dataset.find model.training_dataset_id
- training_dataset.folds(n).each_with_index do |fold,fold_nr|
- #fork do # parallel execution of validations
- $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
- t = Time.now
- validation = Validation.create(model, fold[0], fold[1],cv)
- $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
- #end
- end
- #Process.waitall
- cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
- cv.validations.each do |validation|
- nr_instances += validation.nr_instances
- nr_unpredicted += validation.nr_unpredicted
- predictions += validation.predictions
- end
- cv.update_attributes(
- nr_instances: nr_instances,
- nr_unpredicted: nr_unpredicted,
- predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
- )
- $logger.debug "Nr unpredicted: #{nr_unpredicted}"
- cv.statistics
- cv
- end
- end
-
- class ClassificationCrossValidation < CrossValidation
-
- field :accept_values, type: Array
- field :confusion_matrix, type: Array
- field :weighted_confusion_matrix, type: Array
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :confidence_plot_id, type: BSON::ObjectId
- # TODO auc, f-measure (usability??)
-
- def statistics
- accept_values = Feature.find(model.prediction_feature_id).accept_values
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- true_rate = {}
- predictivity = {}
- predictions.each do |pred|
- compound_id,activities,prediction,confidence = pred
- if activities and prediction #and confidence.numeric?
- if activities.uniq.size == 1
- activity = activities.uniq.first
- if prediction == activity
- if prediction == accept_values[0]
- confusion_matrix[0][0] += 1
- #weighted_confusion_matrix[0][0] += confidence
- elsif prediction == accept_values[1]
- confusion_matrix[1][1] += 1
- #weighted_confusion_matrix[1][1] += confidence
- end
- elsif prediction != activity
- if prediction == accept_values[0]
- confusion_matrix[0][1] += 1
- #weighted_confusion_matrix[0][1] += confidence
- elsif prediction == accept_values[1]
- confusion_matrix[1][0] += 1
- #weighted_confusion_matrix[1][0] += confidence
- end
- end
- end
- else
- nr_unpredicted += 1 if prediction.nil?
+ module Validation
+ class CrossValidation < Validation
+ field :validation_ids, type: Array, default: []
+ field :folds, type: Integer, default: 10
+
+ def self.create model, n=10
+ $logger.debug model.algorithms
+ klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
+ klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
+ bad_request_error "Unknown model class #{model.class}." unless klass
+
+ cv = klass.new(
+ name: model.name,
+ model_id: model.id,
+ folds: n
+ )
+ cv.save # set created_at
+
+ nr_instances = 0
+ nr_unpredicted = 0
+ training_dataset = model.training_dataset
+ training_dataset.folds(n).each_with_index do |fold,fold_nr|
+ #fork do # parallel execution of validations can lead to Rserve and memory problems
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
+ t = Time.now
+ validation = TrainTest.create(model, fold[0], fold[1])
+ cv.validation_ids << validation.id
+ cv.nr_instances += validation.nr_instances
+ cv.nr_unpredicted += validation.nr_unpredicted
+ #cv.predictions.merge! validation.predictions
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
+ #end
end
+ #Process.waitall
+ cv.save
+ $logger.debug "Nr unpredicted: #{nr_unpredicted}"
+ cv.statistics
+ cv.update_attributes(finished_at: Time.now)
+ cv
end
- true_rate = {}
- predictivity = {}
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+
+ def time
+ finished_at - created_at
end
- confidence_sum = 0
- #weighted_confusion_matrix.each do |r|
- #r.each do |c|
- #confidence_sum += c
- #end
- #end
- update_attributes(
- accept_values: accept_values,
- confusion_matrix: confusion_matrix,
- #weighted_confusion_matrix: weighted_confusion_matrix,
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
- true_rate: true_rate,
- predictivity: predictivity,
- finished_at: Time.now
- )
- $logger.debug "Accuracy #{accuracy}"
- end
- def confidence_plot
- unless confidence_plot_id
- tmpfile = "/tmp/#{id.to_s}_confidence.png"
- accuracies = []
- confidences = []
- correct_predictions = 0
- incorrect_predictions = 0
- predictions.each do |p|
- if p[1] and p[2]
- p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
- accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
- confidences << p[3]
+ def validations
+ validation_ids.collect{|vid| TrainTest.find vid}
+ end
- end
- end
- R.assign "accuracy", accuracies
- R.assign "confidence", confidences
- R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
- plot_id = $gridfs.insert_one(file)
- update(:confidence_plot_id => plot_id)
+ def predictions
+ predictions = {}
+ validations.each{|v| predictions.merge!(v.predictions)}
+ predictions
end
- $gridfs.find_one(_id: confidence_plot_id).data
end
- #Average area under roc 0.646
- #Area under roc 0.646
- #F measure carcinogen: 0.769, noncarcinogen: 0.348
- end
+ class ClassificationCrossValidation < CrossValidation
+ include ClassificationStatistics
+ field :accept_values, type: Array
+ field :confusion_matrix, type: Array
+ field :weighted_confusion_matrix, type: Array
+ field :accuracy, type: Float
+ field :weighted_accuracy, type: Float
+ field :true_rate, type: Hash
+ field :predictivity, type: Hash
+ field :probability_plot_id, type: BSON::ObjectId
+ end
- class RegressionCrossValidation < CrossValidation
+ class RegressionCrossValidation < CrossValidation
+ include RegressionStatistics
+ field :rmse, type: Float, default:0
+ field :mae, type: Float, default:0
+ field :r_squared, type: Float
+ field :within_prediction_interval, type: Integer, default:0
+ field :out_of_prediction_interval, type: Integer, default:0
+ field :correlation_plot_id, type: BSON::ObjectId
+ end
- field :rmse, type: Float
- field :mae, type: Float
- field :r_squared, type: Float
- field :correlation_plot_id, type: BSON::ObjectId
+ class RepeatedCrossValidation < Validation
+ field :crossvalidation_ids, type: Array, default: []
+ field :correlation_plot_id, type: BSON::ObjectId
- def statistics
- rmse = 0
- mae = 0
- x = []
- y = []
- predictions.each do |pred|
- compound_id,activity,prediction,confidence = pred
- if activity and prediction
- unless activity == [nil]
- x << -Math.log10(activity.median)
- y << -Math.log10(prediction)
- error = Math.log10(prediction)-Math.log10(activity.median)
- rmse += error**2
- #weighted_rmse += confidence*error**2
- mae += error.abs
- #weighted_mae += confidence*error.abs
- #confidence_sum += confidence
- end
- else
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ def self.create model, folds=10, repeats=3
+ repeated_cross_validation = self.new
+ repeats.times do |n|
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
end
+ repeated_cross_validation.save
+ repeated_cross_validation
end
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(measurement,prediction,use='complete')"
- r = R.eval("r").to_ruby
- mae = mae/predictions.size
- #weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/predictions.size)
- #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
- update_attributes(
- mae: mae,
- rmse: rmse,
- #weighted_mae: weighted_mae,
- #weighted_rmse: weighted_rmse,
- r_squared: r**2,
- finished_at: Time.now
- )
- $logger.debug "R^2 #{r**2}"
- $logger.debug "RMSE #{rmse}"
- $logger.debug "MAE #{mae}"
- end
+ def crossvalidations
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
+ end
- def misclassifications n=nil
- #n = predictions.size unless n
- n ||= 10
- model = Model::Lazar.find(self.model_id)
- training_dataset = Dataset.find(model.training_dataset_id)
- prediction_feature = training_dataset.features.first
- predictions.collect do |p|
- unless p.include? nil
- compound = Compound.find(p[0])
- neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
- neighbors.collect! do |n|
- neighbor = Compound.find(n[0])
- values = training_dataset.values(neighbor,prediction_feature)
- { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+ def correlation_plot format: "png"
+ #unless correlation_plot_id
+ feature = Feature.find(crossvalidations.first.model.prediction_feature)
+ title = feature.name
+ title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
+ tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
+ images = []
+ crossvalidations.each_with_index do |cv,i|
+ x = []
+ y = []
+ cv.predictions.each do |sid,p|
+ x << p["measurements"].median
+ y << p["value"]
+ end
+ R.assign "measurement", x
+ R.assign "prediction", y
+ R.eval "all = c(measurement,prediction)"
+ R.eval "range = c(min(all), max(all))"
+ R.eval "image#{i} = qplot(prediction,measurement,main='#{title} #{i}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
+ R.eval "image#{i} = image#{i} + geom_abline(intercept=0, slope=1)"
+ images << "image#{i}"
+
+ R.eval "ggsave(file='/home/ist/lazar/test/tmp#{i}.pdf', plot=image#{i})"
end
- {
- :smiles => compound.smiles,
- #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
- :measured => p[1],
- :predicted => p[2],
- #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
- :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
- :relative_error => (p[1]-p[2]).abs/p[1],
- :confidence => p[3],
- :neighbors => neighbors
- }
- end
- end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
- end
-
- def confidence_plot
- tmpfile = "/tmp/#{id.to_s}_confidence.png"
- sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
- R.assign "error", sorted_predictions.collect{|p| p[0]}
- R.assign "confidence", sorted_predictions.collect{|p| p[1]}
- # TODO fix axis names
- R.eval "image = qplot(confidence,error)"
- R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
- plot_id = $gridfs.insert_one(file)
- update(:confidence_plot_id => plot_id)
- $gridfs.find_one(_id: confidence_plot_id).data
- end
-
- def correlation_plot
- unless correlation_plot_id
- tmpfile = "/tmp/#{id.to_s}_correlation.png"
- x = predictions.collect{|p| p[1]}
- y = predictions.collect{|p| p[2]}
- attributes = Model::Lazar.find(self.model_id).attributes
- attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
- attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "all = c(-log(measurement),-log(prediction))"
- R.eval "range = c(min(all), max(all))"
- R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
- R.eval "image = image + geom_abline(intercept=0, slope=1)"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
- plot_id = $gridfs.insert_one(file)
- update(:correlation_plot_id => plot_id)
- end
+ R.eval "pdf('#{tmpfile}')"
+ R.eval "grid.arrange(#{images.join ","},ncol=#{images.size})"
+ R.eval "dev.off()"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
+ correlation_plot_id = $gridfs.insert_one(file)
+ update(:correlation_plot_id => correlation_plot_id)
+ #end
$gridfs.find_one(_id: correlation_plot_id).data
- end
- end
-
- class RepeatedCrossValidation
- field :crossvalidation_ids, type: Array, default: []
- def self.create model, folds=10, repeats=3
- repeated_cross_validation = self.new
- repeats.times do |n|
- $logger.debug "Crossvalidation #{n+1} for #{model.name}"
- repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
end
- repeated_cross_validation.save
- repeated_cross_validation
- end
- def crossvalidations
- crossvalidation_ids.collect{|id| CrossValidation.find(id)}
end
end
-
end
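
Hypothetical usage of the refactored validation classes (model is assumed to be an existing Model::LazarRegression instance; creation happens elsewhere in lib/model.rb):

    cv = OpenTox::Validation::CrossValidation.create(model, 10)
    puts cv.rmse
    puts cv.r_squared
    puts cv.time   # finished_at - created_at, in seconds

    rcv = OpenTox::Validation::RepeatedCrossValidation.create(model, 10, 3)
    rcv.crossvalidations.each { |c| puts c.rmse }
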
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 5d8aeaf..ab55294 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -5,46 +5,49 @@ module OpenTox
class Dataset
- # associations like has_many, belongs_to deteriorate performance
- field :feature_ids, type: Array, default: []
- field :compound_ids, type: Array, default: []
- field :data_entries, type: Array, default: []
- field :source, type: String
+ field :data_entries, type: Hash, default: {}
# Readers
- # Get all compounds
def compounds
- @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
- @compounds
+ substances.select{|s| s.is_a? Compound}
+ end
+
+ def nanoparticles
+ substances.select{|s| s.is_a? Nanoparticle}
+ end
+
+ # Get all substances
+ def substances
+ @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq
+ @substances
end
# Get all features
def features
- @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
+ @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq
@features
end
- # Find data entry values for a given compound and feature
- # @param compound [OpenTox::Compound] OpenTox Compound object
- # @param feature [OpenTox::Feature] OpenTox Feature object
- # @return [Array] Data entry values
- def values(compound, feature)
- rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
- col = feature_ids.index feature.id
- rows.collect{|row| data_entries[row][col]}
+ def values substance,feature
+ substance = substance.id if substance.is_a? Substance
+ feature = feature.id if feature.is_a? Feature
+ if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
+ data_entries[substance.to_s][feature.to_s]
+ else
+ nil
+ end
end
# Writers
- # Set compounds
- def compounds=(compounds)
- self.compound_ids = compounds.collect{|c| c.id}
- end
-
- # Set features
- def features=(features)
- self.feature_ids = features.collect{|f| f.id}
+ def add(substance,feature,value)
+ substance = substance.id if substance.is_a? Substance
+ feature = feature.id if feature.is_a? Feature
+ data_entries[substance.to_s] ||= {}
+ data_entries[substance.to_s][feature.to_s] ||= []
+ data_entries[substance.to_s][feature.to_s] << value
+ #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source
end
# Dataset operations
@@ -53,13 +56,7 @@ module OpenTox
# @param [Integer] number of folds
# @return [Array] Array with folds [training_dataset,test_dataset]
def folds n
- unique_compound_data = {}
- compound_ids.each_with_index do |cid,i|
- unique_compound_data[cid] ||= []
- unique_compound_data[cid] << data_entries[i]
- end
- unique_compound_ids = unique_compound_data.keys
- len = unique_compound_ids.size
+ len = self.substances.size
indices = (0..len-1).to_a.shuffle
mid = (len/n)
chunks = []
@@ -68,22 +65,16 @@ module OpenTox
last = start+mid
last = last-1 unless len%n >= i
test_idxs = indices[start..last] || []
- test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
+ test_substances = test_idxs.collect{|i| substances[i]}
training_idxs = indices-test_idxs
- training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
- chunk = [training_cids,test_cids].collect do |unique_cids|
- cids = []
- data_entries = []
- unique_cids.each do |cid|
- unique_compound_data[cid].each do |de|
- cids << cid
- data_entries << de
- end
- end
- dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
- dataset.compounds.each do |compound|
- compound.dataset_ids << dataset.id
- compound.save
+ training_substances = training_idxs.collect{|i| substances[i]}
+ chunk = [training_substances,test_substances].collect do |substances|
+ dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id )
+ substances.each do |substance|
+ substance.dataset_ids << dataset.id
+ substance.dataset_ids.uniq!
+ substance.save
+ dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
end
dataset.save
dataset
@@ -94,41 +85,37 @@ module OpenTox
chunks
end
- # Diagnostics
-
- def duplicates feature=self.features.first
- col = feature_ids.index feature.id
- dups = {}
- compound_ids.each_with_index do |cid,i|
- rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
- values = rows.collect{|row| data_entries[row][col]}
- dups[cid] = values if values.size > 1
- end
- dups
- end
-
- def correlation_plot training_dataset
- # TODO: create/store svg
- R.assign "features", data_entries
- R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
- R.eval "featurePlot(features,activities)"
- end
-
- def density_plot
- # TODO: create/store svg
- R.assign "acts", data_entries.collect{|r| r.first }#.compact
- R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
- end
-
# Serialisation
# converts dataset to csv format including compound smiles as first column, other column headers are feature names
# @return [String]
def to_csv(inchi=false)
- CSV.generate() do |csv| #{:force_quotes=>true}
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
- compounds.each_with_index do |c,i|
- csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
+ CSV.generate() do |csv|
+ compound = substances.first.is_a? Compound
+ if compound
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
+ else
+ csv << ["Name"] + features.collect{|f| f.name}
+ end
+ substances.each do |substance|
+ if compound
+ name = (inchi ? substance.inchi : substance.smiles)
+ else
+ name = substance.name
+ end
+ nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq
+
+ if nr_measurements.size > 1
+ warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
+ else
+ (0..nr_measurements.first-1).each do |i|
+ row = [name]
+ features.each do |f|
+ values(substance,f) ? row << values(substance,f)[i] : row << ""
+ end
+ csv << row
+ end
+ end
end
end
end
@@ -143,9 +130,8 @@ module OpenTox
#end
# Create a dataset from CSV file
- # TODO: document structure
- def self.from_csv_file file, source=nil, bioassay=true#, layout={}
- source ||= file
+ def self.from_csv_file file, accept_empty_values=false
+ source = file
name = File.basename(file,".*")
dataset = self.find_by(:source => source, :name => name)
if dataset
@@ -154,171 +140,116 @@ module OpenTox
$logger.debug "Parsing #{file}."
table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
dataset = self.new(:source => source, :name => name)
- dataset.parse_table table, bioassay#, layout
+ dataset.parse_table table, accept_empty_values
end
dataset
end
# parse data in tabular format (e.g. from csv)
# does a lot of guesswork in order to determine feature types
- def parse_table table, bioassay=true
-
- time = Time.now
+ def parse_table table, accept_empty_values
# features
feature_names = table.shift.collect{|f| f.strip}
- warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
+ warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
compound_format = feature_names.shift.strip
bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
-
numeric = []
+ features = []
# guess feature types
feature_names.each_with_index do |f,i|
metadata = {:name => f}
values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
types = values.collect{|v| v.numeric? ? true : false}.uniq
+ feature = nil
if values.size == 0 # empty feature
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
- metadata["numeric"] = true
numeric[i] = true
+ feature = NumericFeature.find_or_create_by(metadata)
else
- metadata["nominal"] = true
metadata["accept_values"] = values
numeric[i] = false
+ feature = NominalFeature.find_or_create_by(metadata)
end
- if bioassay
- if metadata["numeric"]
- feature = NumericBioAssay.find_or_create_by(metadata)
- elsif metadata["nominal"]
- feature = NominalBioAssay.find_or_create_by(metadata)
- end
- else
- metadata.merge({:measured => false, :calculated => true})
- if metadata["numeric"]
- feature = NumericFeature.find_or_create_by(metadata)
- elsif metadata["nominal"]
- feature = NominalFeature.find_or_create_by(metadata)
- end
- end
- feature_ids << feature.id if feature
+ features << feature if feature
end
- $logger.debug "Feature values: #{Time.now-time}"
- time = Time.now
-
- r = -1
- compound_time = 0
- value_time = 0
-
- # compounds and values
- self.data_entries = []
+ # substances and values
+ all_substances = []
table.each_with_index do |vals,i|
- ct = Time.now
identifier = vals.shift.strip
- warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
+ warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values
begin
case compound_format
when /SMILES/i
- compound = OpenTox::Compound.from_smiles(identifier)
+ substance = OpenTox::Compound.from_smiles(identifier)
when /InChI/i
- compound = OpenTox::Compound.from_inchi(identifier)
+ substance = OpenTox::Compound.from_inchi(identifier)
end
rescue
- compound = nil
+ substance = nil
end
- if compound.nil?
- # compound parsers may return nil
- warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
+ if substance.nil? # compound parsers may return nil
+ warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
next
end
- compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
- compound_time += Time.now-ct
+ all_substances << substance
+ substance.dataset_ids << self.id
+ substance.dataset_ids.uniq!
+ substance.save
- r += 1
- unless vals.size == feature_ids.size # way cheaper than accessing features
- warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
+ unless vals.size == features.size
+ warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
next
end
- compound_ids << compound.id
- table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
-
vals.each_with_index do |v,j|
if v.blank?
- warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
+ warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'."
next
elsif numeric[j]
v = v.to_f
else
v = v.strip
end
- self.data_entries.last[j] = v
- #i = compound.feature_ids.index feature_ids[j]
- compound.features[feature_ids[j].to_s] ||= []
- compound.features[feature_ids[j].to_s] << v
- compound.save
+ add substance, features[j], v
end
+ data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values
end
- compounds.duplicates.each do |compound|
+ all_substances.duplicates.each do |substance|
positions = []
- compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
- warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
+ warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
end
-
- $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
- time = Time.now
save
- $logger.debug "Saving: #{Time.now-time}"
-
end
- # Fill unset data entries
- # @param any value
- def fill_nil_with n
- (0 .. compound_ids.size-1).each do |i|
- data_entries[i] ||= []
- (0 .. feature_ids.size-1).each do |j|
- data_entries[i][j] ||= n
- end
- end
+ def delete
+ compounds.each{|c| c.dataset_ids.delete id.to_s}
+ super
end
end
# Dataset for lazar predictions
- class LazarPrediction < Dataset
+ class LazarPrediction #< Dataset
field :creator, type: String
- field :prediction_feature_id, type: String
+ field :prediction_feature_id, type: BSON::ObjectId
+ field :predictions, type: Hash, default: {}
def prediction_feature
Feature.find prediction_feature_id
end
- end
-
- # Dataset for descriptors (physchem)
- class DescriptorDataset < Dataset
- field :feature_calculation_algorithm, type: String
-
- end
-
- class ScaledDataset < DescriptorDataset
-
- field :centers, type: Array, default: []
- field :scales, type: Array, default: []
+ def compounds
+ substances.select{|s| s.is_a? Compound}
+ end
- def original_value value, i
- value * scales[i] + centers[i]
+ def substances
+ predictions.keys.collect{|id| Substance.find id}
end
- end
- # Dataset for fminer descriptors
- class FminerDataset < DescriptorDataset
- field :training_algorithm, type: String
- field :training_dataset_id, type: BSON::ObjectId
- field :training_feature_id, type: BSON::ObjectId
- field :training_parameters, type: Hash
end
end
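
The flat compound_ids/feature_ids/data_entries arrays are replaced by a single nested hash, which add and values read and write. Shape of the new data_entries (ids shown as shortened literals; real keys are BSON::ObjectId strings):

    data_entries = {}
    substance_id, feature_id = "5a1f", "fe42"
    data_entries[substance_id] ||= {}
    data_entries[substance_id][feature_id] ||= []
    data_entries[substance_id][feature_id] << 0.33   # repeated measurements append
    p data_entries   # => {"5a1f"=>{"fe42"=>[0.33]}}
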
diff --git a/lib/feature.rb b/lib/feature.rb
index b58946b..0ca4d41 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -2,27 +2,28 @@ module OpenTox
# Basic feature class
class Feature
- field :nominal, type: Boolean
- field :numeric, type: Boolean
field :measured, type: Boolean
field :calculated, type: Boolean
+ field :category, type: String
+ field :unit, type: String
+ field :conditions, type: Hash
+
+ def nominal?
+ self.class == NominalFeature
+ end
+
+ def numeric?
+ self.class == NumericFeature
+ end
end
# Feature for categorical variables
class NominalFeature < Feature
field :accept_values, type: Array
- def initialize params
- super params
- nominal = true
- end
end
# Feature for quantitative variables
class NumericFeature < Feature
- def initialize params
- super params
- numeric = true
- end
end
# Feature for SMARTS fragments
@@ -34,12 +35,4 @@ module OpenTox
end
end
- # Feature for categorical bioassay results
- class NominalBioAssay < NominalFeature
- end
-
- # Feature for quantitative bioassay results
- class NumericBioAssay < NumericFeature
- end
-
end
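
Feature types are now encoded in the class hierarchy instead of boolean fields. Note that nominal? and numeric? use an exact class comparison, so an instance of a subclass of NominalFeature would not report nominal?. A quick check (the feature lookup is hypothetical):

    feature = OpenTox::Feature.find(feature_id)
    feature.nominal?   # true only for NominalFeature instances
    feature.numeric?   # true only for NumericFeature instances
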
diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb
new file mode 100644
index 0000000..65f9752
--- /dev/null
+++ b/lib/feature_selection.rb
@@ -0,0 +1,42 @@
+module OpenTox
+ module Algorithm
+
+ class FeatureSelection
+
+ def self.correlation_filter model
+ relevant_features = {}
+ R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
+ model.descriptor_weights = []
+ selected_variables = []
+ selected_descriptor_ids = []
+ model.independent_variables.each_with_index do |v,i|
+ v.collect!{|n| to_r(n)}
+ R.assign "independent", v
+ begin
+ R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')"
+ pvalue = R.eval("cor$p.value").to_ruby
+ if pvalue <= 0.05
+ model.descriptor_weights << R.eval("cor$estimate").to_ruby**2
+ selected_variables << v
+ selected_descriptor_ids << model.descriptor_ids[i]
+ end
+ rescue
+ warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
+ end
+ end
+
+ model.independent_variables = selected_variables
+ model.descriptor_ids = selected_descriptor_ids
+ model
+ end
+
+ def self.to_r v
+ return 0 if v == false
+ return 1 if v == true
+ v
+ end
+
+ end
+
+ end
+end
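
The correlation filter keeps a descriptor only if the Pearson correlation between its values and the activities has p <= 0.05, and stores the squared correlation coefficient as the descriptor weight. For reference, a pure-Ruby Pearson r (the library itself delegates to R's cor.test, which also supplies the p-value):

    def pearson_r x, y
      n  = x.size.to_f
      mx = x.sum/n
      my = y.sum/n
      cov = x.zip(y).sum { |a,b| (a - mx)*(b - my) }
      sx  = Math.sqrt(x.sum { |a| (a - mx)**2 })
      sy  = Math.sqrt(y.sum { |b| (b - my)**2 })
      cov/(sx*sy)
    end

    r = pearson_r [1,2,3,4], [1.1,1.9,3.2,3.8]
    puts r**2   # the squared coefficient stored as a descriptor weight
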
diff --git a/lib/import.rb b/lib/import.rb
new file mode 100644
index 0000000..aa2ee75
--- /dev/null
+++ b/lib/import.rb
@@ -0,0 +1,122 @@
+module OpenTox
+
+ module Import
+
+ class Enanomapper
+ include OpenTox
+
+ # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
+ def self.import dir="."
+ datasets = {}
+ bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+ bundles.each do |bundle|
+ datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
+ $logger.debug bundle["title"]
+ nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
+ nanoparticles.each_with_index do |np,n|
+ core_id = nil
+ coating_ids = []
+ np["composition"].each do |c|
+ uri = c["component"]["compound"]["URI"]
+ uri = CGI.escape File.join(uri,"&media=application/json")
+ data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
+ smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
+ names = []
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
+ if smiles
+ compound = Compound.find_or_create_by(:smiles => smiles)
+ compound.name = names.first
+ compound.names = names.compact
+ else
+ compound = Compound.find_or_create_by(:name => names.first,:names => names)
+ end
+ compound.save
+ if c["relation"] == "HAS_CORE"
+ core_id = compound.id.to_s
+ elsif c["relation"] == "HAS_COATING"
+ coating_ids << compound.id.to_s
+ end
+ end if np["composition"]
+ nanoparticle = Nanoparticle.find_or_create_by(
+ :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+ :source => np["compound"]["URI"],
+ :core_id => core_id,
+ :coating_ids => coating_ids
+ )
+ np["bundles"].keys.each do |bundle_uri|
+ nanoparticle.dataset_ids << datasets[bundle_uri].id
+ end
+
+ studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
+ studies.each do |study|
+ dataset = datasets[np["bundles"].keys.first]
+ proteomics_features = {}
+ category = study["protocol"]["topcategory"]
+ source = study["protocol"]["category"]["term"]
+ study["effects"].each do |effect|
+
+ effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
+ effect["conditions"].delete_if { |k, v| v.nil? }
+
+ if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
+
+ JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
+ proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
+ nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
+ end
+ else
+ name = effect["endpoint"]
+ unit = effect["result"]["unit"]
+ warnings = []
+ case name
+ when "Log2 transformed" # use a sensible name
+ name = "log2(Net cell association)"
+ warnings = ["Original name was 'Log2 transformed'"]
+ unit = "log2(mL/ug(Mg))"
+ when "Total protein (BCA assay)"
+ category = "P-CHEM"
+ warnings = ["Category changed from TOX to P-CHEM"]
+ end
+ feature = klass.find_or_create_by(
+ :name => name,
+ :unit => unit,
+ :category => category,
+ :conditions => effect["conditions"],
+ :source => study["protocol"]["category"]["term"],
+ :measured => true,
+ :warnings => warnings
+ )
+ nanoparticle.parse_ambit_value feature, effect["result"], dataset
+ end
+ end
+ end
+ nanoparticle.save
+ print "#{n}, "
+ end
+ end
+ datasets.each { |u,d| d.save }
+ end
+
+=begin
+ def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
+ #get list of bundle URIs
+ bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+ datasets = []
+ bundles.each do |bundle|
+ uri = bundle["URI"]
+ study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
+ study["@graph"].each do |i|
+ puts i.to_yaml if i.keys.include? "sio:has-value"
+ end
+ end
+ datasets.collect{|d| d.id}
+ end
+=end
+
+ end
+
+ end
+
+end
+
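
The importer walks bundle -> nanoparticles -> studies through the AMBIT JSON API. A minimal sketch of the first step, using the same endpoint as the code above (RestClientWrapper comes from lib/rest-client-wrapper.rb):

    require 'json'
    bundles = JSON.parse(
      RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson')
    )["dataset"]
    bundles.each { |b| puts "#{b['title']}: #{b['URI']}" }
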
diff --git a/lib/lazar.rb b/lib/lazar.rb
index a28ba3a..f251379 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -48,6 +48,7 @@ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
R = Rserve::Connection.new
R.eval "
suppressPackageStartupMessages({
+ library(labeling,lib=\"#{rlib}\")
library(iterators,lib=\"#{rlib}\")
library(foreach,lib=\"#{rlib}\")
library(ggplot2,lib=\"#{rlib}\")
@@ -56,12 +57,14 @@ suppressPackageStartupMessages({
library(pls,lib=\"#{rlib}\")
library(caret,lib=\"#{rlib}\")
library(doMC,lib=\"#{rlib}\")
+ library(randomForest,lib=\"#{rlib}\")
+ library(plyr,lib=\"#{rlib}\")
registerDoMC(#{NR_CORES})
})
"
# OpenTox classes and includes
-CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
[ # be aware of the require sequence as it affects class/method overwrites
"overwrite.rb",
@@ -70,15 +73,22 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
"opentox.rb",
"feature.rb",
"physchem.rb",
+ "substance.rb",
"compound.rb",
+ "nanoparticle.rb",
"dataset.rb",
"algorithm.rb",
+ "similarity.rb",
+ "feature_selection.rb",
"model.rb",
"classification.rb",
"regression.rb",
+ "caret.rb",
+ "validation-statistics.rb",
"validation.rb",
- "crossvalidation.rb",
+ "train-test-validation.rb",
"leave-one-out-validation.rb",
- "experiment.rb",
+ "crossvalidation.rb",
+ #"experiment.rb",
+ "import.rb",
].each{ |f| require_relative f }
-OpenTox::PhysChem.descriptors # load descriptor features
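
The require sequence matters because later files reopen or subclass constants defined by earlier ones, e.g. crossvalidation.rb needs the Validation classes from validation.rb. An illustrative toy (not lazar code):

    class Validation; end                    # must be defined first
    class CrossValidation < Validation; end  # raises NameError if required
                                             # before its superclass exists
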
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 2cd13db..538b7b3 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -1,205 +1,59 @@
module OpenTox
- class LeaveOneOutValidation
-
- field :model_id, type: BSON::ObjectId
- field :dataset_id, type: BSON::ObjectId
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :predictions, type: Array
- field :finished_at, type: Time
-
- def self.create model
- model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
- loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
- compound_ids = model.training_dataset.compound_ids
- predictions = model.predict model.training_dataset.compounds
- predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
- predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
- loo.nr_instances = predictions.size
- predictions.select!{|p| p[:value]} # remove unpredicted
- loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
- loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
- loo.statistics
- loo.save
- loo
- end
-
- def model
- Model::Lazar.find model_id
- end
- end
-
- class ClassificationLeaveOneOutValidation < LeaveOneOutValidation
-
- field :accept_values, type: Array
- field :confusion_matrix, type: Array, default: []
- field :weighted_confusion_matrix, type: Array, default: []
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash, default: {}
- field :predictivity, type: Hash, default: {}
- field :confidence_plot_id, type: BSON::ObjectId
-
- def statistics
- accept_values = Feature.find(model.prediction_feature_id).accept_values
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- predictions.each do |pred|
- pred[:database_activities].each do |db_act|
- if pred[:value]
- if pred[:value] == db_act
- if pred[:value] == accept_values[0]
- confusion_matrix[0][0] += 1
- weighted_confusion_matrix[0][0] += pred[:confidence]
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][1] += 1
- weighted_confusion_matrix[1][1] += pred[:confidence]
- end
- else
- if pred[:value] == accept_values[0]
- confusion_matrix[0][1] += 1
- weighted_confusion_matrix[0][1] += pred[:confidence]
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][0] += 1
- weighted_confusion_matrix[1][0] += pred[:confidence]
- end
- end
+ module Validation
+
+ class LeaveOneOut < Validation
+
+ def self.create model
+ bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
+ $logger.debug "#{model.name}: LOO validation started"
+ t = Time.now
+ model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut
+ loo = klass.new :model_id => model.id
+ predictions = model.predict model.training_dataset.substances
+ predictions.each{|cid,p| p.delete(:neighbors)}
+ nr_unpredicted = 0
+ predictions.each do |cid,prediction|
+ if prediction[:value]
+ prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id])
+ else
+ nr_unpredicted += 1
end
+ predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
end
+ predictions.select!{|cid,p| p[:value] and p[:measurements]}
+ loo.nr_instances = predictions.size
+ loo.nr_unpredicted = nr_unpredicted
+ loo.predictions = predictions
+ loo.statistics
+ $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
+ loo
end
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
- end
- end
- update_attributes(
- accept_values: accept_values,
- confusion_matrix: confusion_matrix,
- weighted_confusion_matrix: weighted_confusion_matrix,
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
- true_rate: true_rate,
- predictivity: predictivity,
- finished_at: Time.now
- )
- $logger.debug "Accuracy #{accuracy}"
- end
-
- def confidence_plot
- unless confidence_plot_id
- tmpfile = "/tmp/#{id.to_s}_confidence.svg"
- accuracies = []
- confidences = []
- correct_predictions = 0
- incorrect_predictions = 0
- predictions.each do |p|
- p[:database_activities].each do |db_act|
- if p[:value]
- p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
- accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
- confidences << p[:confidence]
- end
- end
- end
- R.assign "accuracy", accuracies
- R.assign "confidence", confidences
- R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
- plot_id = $gridfs.insert_one(file)
- update(:confidence_plot_id => plot_id)
- end
- $gridfs.find_one(_id: confidence_plot_id).data
end
- end
-
-
- class RegressionLeaveOneOutValidation < LeaveOneOutValidation
-
- field :rmse, type: Float, default: 0.0
- field :mae, type: Float, default: 0
- #field :weighted_rmse, type: Float, default: 0
- #field :weighted_mae, type: Float, default: 0
- field :r_squared, type: Float
- field :correlation_plot_id, type: BSON::ObjectId
- field :confidence_plot_id, type: BSON::ObjectId
-
- def statistics
- confidence_sum = 0
- predicted_values = []
- measured_values = []
- predictions.each do |pred|
- pred[:database_activities].each do |activity|
- if pred[:value]
- predicted_values << pred[:value]
- measured_values << activity
- error = Math.log10(pred[:value])-Math.log10(activity)
- self.rmse += error**2
- #self.weighted_rmse += pred[:confidence]*error**2
- self.mae += error.abs
- #self.weighted_mae += pred[:confidence]*error.abs
- #confidence_sum += pred[:confidence]
- end
- end
- if pred[:database_activities].empty?
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- R.assign "measurement", measured_values
- R.assign "prediction", predicted_values
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
- r = R.eval("r").to_ruby
-
- self.mae = self.mae/predictions.size
- #self.weighted_mae = self.weighted_mae/confidence_sum
- self.rmse = Math.sqrt(self.rmse/predictions.size)
- #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
- self.r_squared = r**2
- self.finished_at = Time.now
- save
- $logger.debug "R^2 #{r**2}"
- $logger.debug "RMSE #{rmse}"
- $logger.debug "MAE #{mae}"
+ class ClassificationLeaveOneOut < LeaveOneOut
+ include ClassificationStatistics
+ field :accept_values, type: Array
+ field :confusion_matrix, type: Array, default: []
+ field :weighted_confusion_matrix, type: Array, default: []
+ field :accuracy, type: Float
+ field :weighted_accuracy, type: Float
+ field :true_rate, type: Hash, default: {}
+ field :predictivity, type: Hash, default: {}
+ field :confidence_plot_id, type: BSON::ObjectId
end
-
- def correlation_plot
- unless correlation_plot_id
- tmpfile = "/tmp/#{id.to_s}_correlation.svg"
- predicted_values = []
- measured_values = []
- predictions.each do |pred|
- pred[:database_activities].each do |activity|
- if pred[:value]
- predicted_values << pred[:value]
- measured_values << activity
- end
- end
- end
- attributes = Model::Lazar.find(self.model_id).attributes
- attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
- attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
- R.assign "measurement", measured_values
- R.assign "prediction", predicted_values
- R.eval "all = c(-log(measurement),-log(prediction))"
- R.eval "range = c(min(all), max(all))"
- R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
- R.eval "image = image + geom_abline(intercept=0, slope=1)"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
- plot_id = $gridfs.insert_one(file)
- update(:correlation_plot_id => plot_id)
- end
- $gridfs.find_one(_id: correlation_plot_id).data
+
+ class RegressionLeaveOneOut < LeaveOneOut
+ include RegressionStatistics
+ field :rmse, type: Float, default: 0
+ field :mae, type: Float, default: 0
+ field :r_squared, type: Float
+ field :within_prediction_interval, type: Integer, default:0
+ field :out_of_prediction_interval, type: Integer, default:0
+ field :correlation_plot_id, type: BSON::ObjectId
end
+
end
end
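
A minimal usage sketch for the new leave-one-out API above; the CSV file name and variable names are illustrative assumptions, not part of the commit:

    # hypothetical usage, assuming an existing training dataset
    training_dataset = OpenTox::Dataset.from_csv_file "training.csv"
    model = OpenTox::Model::Lazar.create training_dataset: training_dataset
    loo = OpenTox::Validation::LeaveOneOut.create model
    loo.nr_instances    # predictions with both value and measurements
    loo.nr_unpredicted  # substances without a prediction
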
diff --git a/lib/model.rb b/lib/model.rb
index 8e657b8..38c1915 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -2,7 +2,8 @@ module OpenTox
module Model
- class Model
+ class Lazar
+
include OpenTox
include Mongoid::Document
include Mongoid::Timestamps
@@ -10,64 +11,247 @@ module OpenTox
field :name, type: String
field :creator, type: String, default: __FILE__
- # datasets
+ field :algorithms, type: Hash, default:{}
field :training_dataset_id, type: BSON::ObjectId
- # algorithms
- field :prediction_algorithm, type: String
- # prediction feature
+ field :substance_ids, type: Array, default:[]
field :prediction_feature_id, type: BSON::ObjectId
+ field :dependent_variables, type: Array, default:[]
+ field :descriptor_ids, type:Array, default:[]
+ field :independent_variables, type: Array, default:[]
+ field :fingerprints, type: Array, default:[]
+ field :descriptor_weights, type: Array, default:[]
+ field :descriptor_means, type: Array, default:[]
+ field :descriptor_sds, type: Array, default:[]
+ field :scaled_variables, type: Array, default:[]
+ field :version, type: Hash, default:{}
+
+ def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
+ bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
+ prediction_feature = training_dataset.features.first unless prediction_feature
+ # TODO: prediction_feature without training_dataset: use all available data
+
+ # guess model type
+ prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new
+
+ model.prediction_feature_id = prediction_feature.id
+ model.training_dataset_id = training_dataset.id
+ model.name = "#{prediction_feature.name} (#{training_dataset.name})"
+ # TODO: check if this works for gem version, add gem versioning?
+ dir = File.dirname(__FILE__)
+ commit = `cd #{dir}; git rev-parse HEAD`.chomp
+ branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp
+ url = `cd #{dir}; git config --get remote.origin.url`.chomp
+      if branch and !branch.empty?
+ model.version = {:url => url, :branch => branch, :commit => commit}
+ else
+ model.version = {:warning => "git is not installed"}
+ end
- def training_dataset
- Dataset.find(training_dataset_id)
+ # set defaults
+ substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
+ bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
+
+ if substance_classes.first == "OpenTox::Compound"
+
+ model.algorithms = {
+ :descriptors => {
+ :method => "fingerprint",
+ :type => "MP2D",
+ },
+ :similarity => {
+ :method => "Algorithm::Similarity.tanimoto",
+ :min => 0.1
+ },
+ :feature_selection => nil
+ }
+
+ if model.class == LazarClassification
+ model.algorithms[:prediction] = {
+ :method => "Algorithm::Classification.weighted_majority_vote",
+ }
+ elsif model.class == LazarRegression
+ model.algorithms[:prediction] = {
+ :method => "Algorithm::Caret.pls",
+ }
+ end
+
+ elsif substance_classes.first == "OpenTox::Nanoparticle"
+ model.algorithms = {
+ :descriptors => {
+ :method => "properties",
+ :categories => ["P-CHEM"],
+ },
+ :similarity => {
+ :method => "Algorithm::Similarity.weighted_cosine",
+ :min => 0.5
+ },
+ :prediction => {
+ :method => "Algorithm::Caret.rf",
+ },
+ :feature_selection => {
+ :method => "Algorithm::FeatureSelection.correlation_filter",
+ },
+ }
+ else
+ bad_request_error "Cannot create models for #{substance_classes.first}."
+ end
+
+ # overwrite defaults with explicit parameters
+ algorithms.each do |type,parameters|
+ if parameters and parameters.is_a? Hash
+ parameters.each do |p,v|
+ model.algorithms[type] ||= {}
+ model.algorithms[type][p] = v
+ model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type
+ end
+ else
+ model.algorithms[type] = parameters
+ end
+ end if algorithms
+
+ # parse dependent_variables from training dataset
+ training_dataset.substances.each do |substance|
+ values = training_dataset.values(substance,model.prediction_feature_id)
+ values.each do |v|
+ model.substance_ids << substance.id.to_s
+ model.dependent_variables << v
+ end if values
+ end
+
+ descriptor_method = model.algorithms[:descriptors][:method]
+ case descriptor_method
+ # parse fingerprints
+ when "fingerprint"
+ type = model.algorithms[:descriptors][:type]
+ model.substances.each_with_index do |s,i|
+ model.fingerprints[i] ||= []
+ model.fingerprints[i] += s.fingerprint(type)
+ model.fingerprints[i].uniq!
+ end
+ model.descriptor_ids = model.fingerprints.flatten.uniq
+ model.descriptor_ids.each do |d|
+          # resulting model may break BSON size limit (e.g. for the Kazius dataset)
+ model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/
+ end
+ # calculate physchem properties
+ when "calculate_properties"
+ features = model.algorithms[:descriptors][:features]
+ model.descriptor_ids = features.collect{|f| f.id.to_s}
+ model.algorithms[:descriptors].delete(:features)
+ model.algorithms[:descriptors].delete(:type)
+ model.substances.each_with_index do |s,i|
+ props = s.calculate_properties(features)
+ props.each_with_index do |v,j|
+ model.independent_variables[j] ||= []
+ model.independent_variables[j][i] = v
+ end if props and !props.empty?
+ end
+ # parse independent_variables
+ when "properties"
+ categories = model.algorithms[:descriptors][:categories]
+ feature_ids = []
+ categories.each do |category|
+ Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
+ end
+ properties = model.substances.collect { |s| s.properties }
+ property_ids = properties.collect{|p| p.keys}.flatten.uniq
+ model.descriptor_ids = feature_ids & property_ids
+ model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
+ else
+ bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
+ end
+
+ if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
+ model = Algorithm.run model.algorithms[:feature_selection][:method], model
+ end
+
+ # scale independent_variables
+ unless model.fingerprints?
+ model.independent_variables.each_with_index do |var,i|
+ model.descriptor_means[i] = var.mean
+ model.descriptor_sds[i] = var.standard_deviation
+ model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil}
+ end
+ end
+ model.save
+ model
end
- end
- class Lazar < Model
-
- # algorithms
- field :neighbor_algorithm, type: String
- field :neighbor_algorithm_parameters, type: Hash, default: {}
-
- # Create a lazar model from a training_dataset and a feature_dataset
- # @param [OpenTox::Dataset] training_dataset
- # @return [OpenTox::Model::Lazar] Regression or classification model
- def initialize training_dataset, params={}
-
- super params
-
- # TODO document convention
- prediction_feature = training_dataset.features.first
- # set defaults for empty parameters
- self.prediction_feature_id ||= prediction_feature.id
- self.training_dataset_id ||= training_dataset.id
- self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
- self.neighbor_algorithm_parameters ||= {}
- self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
- save
- self
- end
-
- def predict_compound compound
- prediction_feature = Feature.find prediction_feature_id
- neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
- # remove neighbors without prediction_feature
- # check for database activities (neighbors may include query compound)
- database_activities = nil
+ def predict_substance substance
+
+ case algorithms[:similarity][:method]
+ when /tanimoto/ # binary features
+ similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
+ # TODO this excludes descriptors only present in the query substance
+ # use for applicability domain?
+ query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
+ when /euclid|cosine/ # quantitative features
+ if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
+ features = descriptor_ids.collect{|id| Feature.find(id)}
+ query_descriptors = substance.calculate_properties(features)
+ similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
+ else
+ similarity_descriptors = []
+ query_descriptors = []
+ descriptor_ids.each_with_index do |id,i|
+ prop = substance.properties[id]
+ prop = prop.median if prop.is_a? Array # measured
+ if prop
+ similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i]
+ query_descriptors[i] = prop
+ end
+ end
+ end
+ else
+ bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
+ end
+
prediction = {}
- if neighbors.collect{|n| n["_id"]}.include? compound.id
+ neighbor_ids = []
+ neighbor_similarities = []
+ neighbor_dependent_variables = []
+ neighbor_independent_variables = []
- database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
- prediction[:database_activities] = database_activities
- prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
- neighbors.delete_if{|n| n["_id"] == compound.id}
+ # find neighbors
+ substance_ids.each_with_index do |s,i|
+ # handle query substance
+ if substance.id.to_s == s
+ prediction[:measurements] ||= []
+ prediction[:measurements] << dependent_variables[i]
+          prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors because it is identical to the query substance."
+ else
+ if fingerprints?
+ neighbor_descriptors = fingerprints[i]
+ else
+ next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
+ neighbor_descriptors = scaled_variables.collect{|v| v[i]}
+ end
+ sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
+ if sim >= algorithms[:similarity][:min]
+ neighbor_ids << s
+ neighbor_similarities << sim
+ neighbor_dependent_variables << dependent_variables[i]
+ independent_variables.each_with_index do |c,j|
+ neighbor_independent_variables[j] ||= []
+ neighbor_independent_variables[j] << independent_variables[j][i]
+ end
+ end
+ end
end
- neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
- if neighbors.empty?
- prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
+
+ measurements = nil
+
+ if neighbor_similarities.empty?
+ prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
+ elsif neighbor_similarities.size == 1
+        prediction.merge!({:value => neighbor_dependent_variables.first, :probabilities => nil, :warning => "Only one similar substance in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
else
- prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
- prediction[:neighbors] = neighbors
- prediction[:neighbors] ||= []
+ query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
+ # call prediction algorithm
+        result = Algorithm.run algorithms[:prediction][:method], dependent_variables: neighbor_dependent_variables, independent_variables: neighbor_independent_variables, weights: neighbor_similarities, query_variables: query_descriptors
+ prediction.merge! result
+ prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
end
prediction
end
@@ -77,103 +261,81 @@ module OpenTox
training_dataset = Dataset.find training_dataset_id
# parse data
- compounds = []
- case object.class.to_s
- when "OpenTox::Compound"
- compounds = [object]
- when "Array"
- compounds = object
- when "OpenTox::Dataset"
- compounds = object.compounds
+ substances = []
+ if object.is_a? Substance
+ substances = [object]
+ elsif object.is_a? Array
+ substances = object
+ elsif object.is_a? Dataset
+ substances = object.substances
else
- bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
+ bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
end
# make predictions
- predictions = []
- predictions = compounds.collect{|c| predict_compound c}
+ predictions = {}
+ substances.each do |c|
+ predictions[c.id.to_s] = predict_substance c
+ predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
+ end
# serialize result
- case object.class.to_s
- when "OpenTox::Compound"
- prediction = predictions.first
+ if object.is_a? Substance
+ prediction = predictions[substances.first.id.to_s]
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
return prediction
- when "Array"
+ elsif object.is_a? Array
return predictions
- when "OpenTox::Dataset"
+ elsif object.is_a? Dataset
# prepare prediction dataset
measurement_feature = Feature.find prediction_feature_id
- prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
- prediction_dataset = LazarPrediction.new(
+ prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
+ prediction_dataset = LazarPrediction.create(
:name => "Lazar prediction for #{prediction_feature.name}",
:creator => __FILE__,
- :prediction_feature_id => prediction_feature.id
-
+ :prediction_feature_id => prediction_feature.id,
+ :predictions => predictions
)
- confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
- warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
- prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
- prediction_dataset.compounds = compounds
- prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
- prediction_dataset.save
return prediction_dataset
end
end
-
- def training_activities
- i = training_dataset.feature_ids.index prediction_feature_id
- training_dataset.data_entries.collect{|de| de[i]}
+
+ def training_dataset
+ Dataset.find(training_dataset_id)
+ end
+
+ def prediction_feature
+ Feature.find(prediction_feature_id)
+ end
+
+ def descriptors
+ descriptor_ids.collect{|id| Feature.find(id)}
+ end
+
+ def substances
+ substance_ids.collect{|id| Substance.find(id)}
+ end
+
+ def fingerprints?
+      algorithms[:descriptors][:method] == "fingerprint"
end
end
class LazarClassification < Lazar
-
- def self.create training_dataset, params={}
- model = self.new training_dataset, params
- model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
- model.neighbor_algorithm ||= "fingerprint_neighbors"
- model.neighbor_algorithm_parameters ||= {}
- {
- :type => "MP2D",
- :training_dataset_id => training_dataset.id,
- :min_sim => 0.1
- }.each do |key,value|
- model.neighbor_algorithm_parameters[key] ||= value
- end
- model.save
- model
- end
end
class LazarRegression < Lazar
-
- def self.create training_dataset, params={}
- model = self.new training_dataset, params
- model.neighbor_algorithm ||= "fingerprint_neighbors"
- model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
- model.neighbor_algorithm_parameters ||= {}
- {
- :type => "MP2D",
- :training_dataset_id => training_dataset.id,
- :min_sim => 0.1
- }.each do |key,value|
- model.neighbor_algorithm_parameters[key] ||= value
- end
- model.save
- model
- end
end
- class Prediction
+ class Validation
+
include OpenTox
include Mongoid::Document
include Mongoid::Timestamps
- # TODO field Validations
field :endpoint, type: String
field :species, type: String
field :source, type: String
@@ -182,7 +344,7 @@ module OpenTox
field :repeated_crossvalidation_id, type: BSON::ObjectId
def predict object
- Lazar.find(model_id).predict object
+ model.predict object
end
def training_dataset
@@ -193,8 +355,17 @@ module OpenTox
Lazar.find model_id
end
+ def algorithms
+ model.algorithms
+ end
+
+ def prediction_feature
+ model.prediction_feature
+ end
+
def repeated_crossvalidation
- RepeatedCrossValidation.find repeated_crossvalidation_id
+ # full class name required
+ OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id
end
def crossvalidations
@@ -202,29 +373,51 @@ module OpenTox
end
def regression?
- training_dataset.features.first.numeric?
+ model.is_a? LazarRegression
end
def classification?
- training_dataset.features.first.nominal?
+ model.is_a? LazarClassification
end
def self.from_csv_file file
metadata_file = file.sub(/csv$/,"json")
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
- prediction_model = self.new JSON.parse(File.read(metadata_file))
+ model_validation = self.new JSON.parse(File.read(metadata_file))
training_dataset = Dataset.from_csv_file file
- model = nil
- if training_dataset.features.first.nominal?
- model = LazarClassification.create training_dataset
- elsif training_dataset.features.first.numeric?
- model = LazarRegression.create training_dataset
+ model = Lazar.create training_dataset: training_dataset
+ model_validation[:model_id] = model.id
+ # full class name required
+ model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id
+ model_validation.save
+ model_validation
+ end
+
+ def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
+
+ # find/import training_dataset
+ training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+ unless training_dataset # try to import from json dump
+ Import::Enanomapper.import
+ training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+ bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
end
- prediction_model[:model_id] = model.id
- prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
- prediction_model.save
- prediction_model
+ prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
+
+ model_validation = self.new(
+ :endpoint => prediction_feature.name,
+ :source => prediction_feature.source,
+ :species => "A549 human lung epithelial carcinoma cells",
+ :unit => prediction_feature.unit
+ )
+ model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
+ model_validation[:model_id] = model.id
+ repeated_cv = Validation::RepeatedCrossValidation.create model
+ model_validation[:repeated_crossvalidation_id] = repeated_cv.id
+ model_validation.save
+ model_validation
end
+
end
end
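
The per-type merge above means defaults can be overridden selectively at creation time. A hedged sketch (parameter values are assumptions):

    # hypothetical override of the compound defaults shown in Lazar.create
    model = OpenTox::Model::Lazar.create(
      training_dataset: training_dataset,
      algorithms: {
        :similarity => {:min => 0.3},                       # raise the 0.1 default
        :prediction => {:method => "Algorithm::Caret.rf"}   # swap pls for random forest
      }
    )
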
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
new file mode 100644
index 0000000..02d9a89
--- /dev/null
+++ b/lib/nanoparticle.rb
@@ -0,0 +1,92 @@
+module OpenTox
+
+ class Nanoparticle < Substance
+ include OpenTox
+
+ field :core_id, type: String, default: nil
+ field :coating_ids, type: Array, default: []
+
+ def core
+ Compound.find core_id
+ end
+
+ def coating
+ coating_ids.collect{|i| Compound.find i }
+ end
+
+ def fingerprint type=DEFAULT_FINGERPRINT
+ core_fp = core.fingerprint type
+ coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact
+ (core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact
+ end
+
+ def calculate_properties descriptors=PhysChem::OPENBABEL
+ if core.smiles and !coating.collect{|c| c.smiles}.compact.empty?
+ core_prop = core.calculate_properties descriptors
+ coating_prop = coating.collect{|c| c.calculate_properties descriptors if c.smiles}
+ descriptors.collect_with_index{|d,i| [core_prop[i],coating_prop.collect{|c| c[i] if c}]}
+ end
+ end
+
+ def add_feature feature, value, dataset
+      unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundant
+ case feature.category
+ when "P-CHEM"
+ properties[feature.id.to_s] ||= []
+ properties[feature.id.to_s] << value
+ properties[feature.id.to_s].uniq!
+ when "Proteomics"
+ properties[feature.id.to_s] ||= []
+ properties[feature.id.to_s] << value
+ properties[feature.id.to_s].uniq!
+ when "TOX"
+ dataset.add self, feature, value
+ else
+ warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
+ end
+ dataset_ids << dataset.id
+ dataset_ids.uniq!
+ end
+ end
+
+ def parse_ambit_value feature, v, dataset
+ # TODO add study id to warnings
+ v.delete "unit"
+ # TODO: ppm instead of weights
+ if v.keys == ["textValue"]
+ add_feature feature, v["textValue"], dataset
+ elsif v.keys == ["loValue"]
+ add_feature feature, v["loValue"], dataset
+ elsif v.keys.size == 2 and v["errorValue"]
+ add_feature feature, v["loValue"], dataset
+ #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
+ elsif v.keys.size == 2 and v["loQualifier"] == "mean"
+ add_feature feature, v["loValue"], dataset
+ #warn "'#{feature.name}' is a mean value. Original data is not available."
+ elsif v.keys.size == 2 and v["loQualifier"] #== ">="
+ #warn "Only min value available for '#{feature.name}', entry ignored"
+ elsif v.keys.size == 2 and v["upQualifier"] #== ">="
+ #warn "Only max value available for '#{feature.name}', entry ignored"
+ elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
+ add_feature feature, v["loValue"], dataset
+ #warn "loQualifier and upQualifier are empty."
+ elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == ""
+ add_feature feature, v["loValue"], dataset
+ #warn "loQualifier and upQualifier are empty."
+ elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
+ add_feature feature, v["loValue"], dataset
+ #warn "loQualifier and upQualifier are empty."
+ elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"]
+ #add_feature feature, [v["loValue"],v["upValue"]].mean, dataset
+ #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
+ elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"]
+ #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
+ add_feature feature, v["loValue"], dataset
+ elsif v == {} # do nothing
+ else
+ warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
+ end
+ end
+
+ end
+end
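
For reference, the Ambit value shapes handled by parse_ambit_value map as follows (examples reconstructed from the branches above):

    # {"textValue" => "..."}                      -> stored verbatim
    # {"loValue" => 0.12}                         -> stored
    # {"loValue" => 0.12, "errorValue" => 0.01}   -> loValue stored, error ignored
    # {"loQualifier" => ">=", "loValue" => 0.12}  -> ignored (only a minimum given)
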
diff --git a/lib/opentox.rb b/lib/opentox.rb
index 186c87a..5c300cf 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -1,8 +1,6 @@
module OpenTox
- # Ruby interface
-
- # create default OpenTox classes (defined in opentox-client.rb)
+ # create default OpenTox classes
# provides Mongoid's query and persistence methods
# http://mongoid.org/en/mongoid/docs/persistence.html
# http://mongoid.org/en/mongoid/docs/querying.html
@@ -13,10 +11,15 @@ module OpenTox
include Mongoid::Timestamps
store_in collection: klass.downcase.pluralize
field :name, type: String
+ field :source, type: String
field :warnings, type: Array, default: []
+
+ def warn warning
+ $logger.warn warning
+ warnings << warning
+ end
end
OpenTox.const_set klass,c
end
end
-
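
With the new warn helper every default OpenTox class logs and records a warning in one call; a one-line sketch (the class name is an assumption):

    OpenTox::Feature.new.warn "Example warning"   # writes to $logger and appends to warnings
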
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index cef5758..31d30c9 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -28,6 +28,11 @@ class Float
def signif(n)
Float("%.#{n}g" % self)
end
+
+ # converts -10 logarithmized values back
+ def delog10
+ 10**(-1*self)
+ end
end
module Enumerable
@@ -101,19 +106,35 @@ class Array
end
def mean
- self.inject{ |sum, el| sum + el }.to_f / self.size
+ self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
end
def sample_variance
m = self.mean
- sum = self.inject(0){|accum, i| accum +(i-m)**2 }
- sum/(self.length - 1).to_f
+ sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
+ sum/(self.compact.length - 1).to_f
end
def standard_deviation
Math.sqrt(self.sample_variance)
end
+ def for_R
+ if self.first.is_a?(String)
+ #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets
+ "NA"
+ else
+ self.median
+ end
+ end
+
+ def collect_with_index
+ result = []
+ self.each_with_index do |elt, idx|
+ result << yield(elt, idx)
+ end
+ result
+ end
end
module URI
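
The patched Array helpers now ignore nils; worked examples (assuming the monkey patches above are loaded):

    [1.0, nil, 3.0].mean                       # => 2.0, nil is compacted away
    [1, 2, 3].collect_with_index{|v,i| v*i}    # => [0, 2, 6]
    3.0.delog10                                # => 0.001, i.e. 10**-3
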
diff --git a/lib/physchem.rb b/lib/physchem.rb
index f7b880f..327acd8 100644
--- a/lib/physchem.rb
+++ b/lib/physchem.rb
@@ -14,7 +14,7 @@ module OpenTox
JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
- OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
+ OPENBABEL = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
name,description = d.split(/\s+/,2)
["Openbabel."+name,description] unless obexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]
@@ -25,24 +25,24 @@ module OpenTox
prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'')
d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] }
end
- CDKDESCRIPTORS = cdkdescriptors
+ CDK = cdkdescriptors
# exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
# strip Joelib messages from stdout
- JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
+ JOELIB = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
name = d[:java_class].sub(/^joelib2.feature.types./,'')
["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]
- DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
+ DESCRIPTORS = OPENBABEL.merge(CDK.merge(JOELIB))
require_relative "unique_descriptors.rb"
def self.descriptors desc=DESCRIPTORS
desc.collect do |name,description|
lib,desc = name.split('.',2)
- self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
+ self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
end
end
@@ -54,26 +54,26 @@ module OpenTox
CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n|
dname = "#{name}.#{n}"
description = DESCRIPTORS[dname]
- udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
+ udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
end
else
description = DESCRIPTORS[name]
- udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
+ udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
end
end
udesc
end
def self.openbabel_descriptors
- descriptors OBDESCRIPTORS
+ descriptors OPENBABEL
end
def self.cdk_descriptors
- descriptors CDKDESCRIPTORS
+ descriptors CDK
end
def self.joelib_descriptors
- descriptors JOELIBDESCRIPTORS
+ descriptors JOELIB
end
def calculate compound
@@ -131,3 +131,4 @@ module OpenTox
end
end
+OpenTox::PhysChem.descriptors # load descriptor features
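
After the constant renames, descriptor groups are addressed by library name; a short sketch (the descriptor name is an assumption):

    OpenTox::PhysChem::OPENBABEL.keys.first   # e.g. "Openbabel.HBA1"
    OpenTox::PhysChem.openbabel_descriptors   # find_or_create the corresponding Feature objects
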
diff --git a/lib/regression.rb b/lib/regression.rb
index 5021fb3..3890987 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,148 +3,18 @@ module OpenTox
class Regression
- def self.local_weighted_average compound, params
+ def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
+ # TODO: prediction_interval
weighted_sum = 0.0
sim_sum = 0.0
- neighbors = params[:neighbors]
- neighbors.each do |row|
- sim = row["tanimoto"]
- if row["features"][params[:prediction_feature_id].to_s]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- weighted_sum += sim*Math.log10(act)
- sim_sum += sim
- end
- end
- end
- sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
+ dependent_variables.each_with_index do |v,i|
+ weighted_sum += weights[i]*dependent_variables[i]
+ sim_sum += weights[i]
+ end if dependent_variables
+ sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
{:value => prediction}
end
- # TODO explicit neighbors, also for physchem
- def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
- neighbors = params[:neighbors]
- return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
- activities = []
- fingerprints = {}
- weights = []
- fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
-
- neighbors.each_with_index do |row,i|
- neighbor = Compound.find row["_id"]
- fingerprint = neighbor.fingerprint
- if row["features"][params[:prediction_feature_id].to_s]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- activities << Math.log10(act)
- weights << row["tanimoto"]
- fingerprint_ids.each_with_index do |id,j|
- fingerprints[id] ||= []
- fingerprints[id] << fingerprint.include?(id)
- end
- end
- end
- end
-
- variables = []
- data_frame = [activities]
- fingerprints.each do |k,v|
- unless v.uniq.size == 1
- data_frame << v.collect{|m| m ? "T" : "F"}
- variables << k
- end
- end
-
- if variables.empty?
- result = local_weighted_average(compound, params)
- result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
- return result
-
- else
- compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
- prediction = r_model_prediction method, data_frame, variables, weights, compound_features
- if prediction.nil? or prediction[:value].nil?
- prediction = local_weighted_average(compound, params)
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
- return prediction
- else
- prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
- prediction[:value] = 10**prediction[:value]
- prediction[:rmse] = 10**prediction[:rmse]
- prediction
- end
- end
-
- end
-
- def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4"
-
- neighbors = params[:neighbors]
- return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
- return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
-
- activities = []
- weights = []
- physchem = {}
-
- neighbors.each_with_index do |row,i|
- neighbor = Compound.find row["_id"]
- if row["features"][params[:prediction_feature_id].to_s]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- activities << Math.log10(act)
- weights << row["tanimoto"] # TODO cosine ?
- neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
- physchem[pid] ||= []
- physchem[pid] << v
- end
- end
- end
- end
-
- # remove properties with a single value
- physchem.each do |pid,v|
- physchem.delete(pid) if v.uniq.size <= 1
- end
-
- if physchem.empty?
- result = local_weighted_average(compound, params)
- result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
- return result
-
- else
- data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
- prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
- if prediction.nil?
- prediction = local_weighted_average(compound, params)
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
- return prediction
- else
- prediction[:value] = 10**prediction[:value]
- prediction
- end
- end
-
- end
-
- def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
- R.assign "weights", training_weights
- r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
- R.eval "data <- #{r_data_frame}"
- R.assign "features", training_features
- R.eval "names(data) <- append(c('activities'),features)" #
- begin
- R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"
- rescue
- return nil
- end
- R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
- R.eval "names(fingerprint) <- features"
- R.eval "prediction <- predict(model,fingerprint)"
- {
- :value => R.eval("prediction").to_f,
- :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
- :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
- }
- end
-
end
end
end
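
A worked example of the simplified weighted_average (input values assumed):

    # two neighbors with activities 1.0 and 2.0, similarities 0.9 and 0.1
    OpenTox::Algorithm::Regression.weighted_average dependent_variables: [1.0, 2.0], weights: [0.9, 0.1]
    # => {:value => 1.1}   # (0.9*1.0 + 0.1*2.0)/(0.9 + 0.1)
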
diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb
index 9321a75..2073be2 100644
--- a/lib/rest-client-wrapper.rb
+++ b/lib/rest-client-wrapper.rb
@@ -55,14 +55,8 @@ module OpenTox
if [301, 302, 307].include? response.code and request.method == :get
response.follow_redirection(request, result)
elsif response.code >= 400 and !URI.task?(uri)
- #TODO add parameters to error-report
- #parameters = request.args
- #parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid]
- #parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url]
- #message += "\nREST parameters:\n#{parameters.inspect}"
error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first
begin # errors are returned as error reports in json, try to parse
- # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task
content = JSON.parse(response)
msg = content["message"].to_s
cause = content["errorCause"].to_s
diff --git a/lib/similarity.rb b/lib/similarity.rb
new file mode 100644
index 0000000..0901936
--- /dev/null
+++ b/lib/similarity.rb
@@ -0,0 +1,65 @@
+module OpenTox
+ module Algorithm
+
+ class Vector
+ def self.dot_product(a, b)
+ products = a.zip(b).map{|a, b| a * b}
+ products.inject(0) {|s,p| s + p}
+ end
+
+ def self.magnitude(point)
+ squares = point.map{|x| x ** 2}
+ Math.sqrt(squares.inject(0) {|s, c| s + c})
+ end
+ end
+
+ class Similarity
+
+ def self.tanimoto fingerprints
+      (fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
+ end
+
+ #def self.weighted_tanimoto fingerprints
+ #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
+ #end
+
+ def self.euclid scaled_properties
+ sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
+ Math.sqrt(sq.inject(0) {|s,c| s + c})
+ end
+
+ # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+ def self.cosine scaled_properties
+ scaled_properties = remove_nils scaled_properties
+ Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
+ end
+
+ def self.weighted_cosine scaled_properties # [a,b,weights]
+ a,b,w = remove_nils scaled_properties
+ return cosine(scaled_properties) if w.uniq.size == 1
+ dot_product = 0
+ magnitude_a = 0
+ magnitude_b = 0
+ (0..a.size-1).each do |i|
+ dot_product += w[i].abs*a[i]*b[i]
+ magnitude_a += w[i].abs*a[i]**2
+ magnitude_b += w[i].abs*b[i]**2
+ end
+ dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
+ end
+
+ def self.remove_nils scaled_properties
+ a =[]; b = []; w = []
+ (0..scaled_properties.first.size-1).each do |i|
+ if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan?
+ a << scaled_properties[0][i]
+ b << scaled_properties[1][i]
+ w << scaled_properties[2][i]
+ end
+ end
+ [a,b,w]
+ end
+
+ end
+ end
+end
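
Worked examples for the new similarity methods (fingerprints and weights are assumptions):

    fp_a = ["C", "CC", "CCO"]
    fp_b = ["C", "CCO", "O"]
    OpenTox::Algorithm::Similarity.tanimoto [fp_a, fp_b]
    # => 0.5, two common fragments out of four distinct ones

    # weighted_cosine expects [a, b, weights]; equal weights fall back to plain cosine
    OpenTox::Algorithm::Similarity.weighted_cosine [[1.0, 0.0], [1.0, 0.0], [0.5, 0.5]]
    # => 1.0
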
diff --git a/lib/substance.rb b/lib/substance.rb
new file mode 100644
index 0000000..31c465e
--- /dev/null
+++ b/lib/substance.rb
@@ -0,0 +1,8 @@
+module OpenTox
+
+ class Substance
+ field :properties, type: Hash, default: {}
+ field :dataset_ids, type: Array, default: []
+ end
+
+end
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
new file mode 100644
index 0000000..71abad2
--- /dev/null
+++ b/lib/train-test-validation.rb
@@ -0,0 +1,69 @@
+module OpenTox
+
+ module Validation
+
+ class TrainTest < Validation
+
+ field :training_dataset_id, type: BSON::ObjectId
+ field :test_dataset_id, type: BSON::ObjectId
+
+ def self.create model, training_set, test_set
+
+ validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
+ validation_model.save
+ predictions = validation_model.predict test_set.substances
+ nr_unpredicted = 0
+ predictions.each do |cid,prediction|
+ if prediction[:value]
+ prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id])
+ else
+ nr_unpredicted += 1
+ end
+ end
+ predictions.select!{|cid,p| p[:value] and p[:measurements]}
+ validation = self.new(
+ :model_id => validation_model.id,
+ :test_dataset_id => test_set.id,
+ :nr_instances => test_set.substances.size,
+ :nr_unpredicted => nr_unpredicted,
+ :predictions => predictions
+ )
+ validation.save
+ validation
+ end
+
+ def test_dataset
+ Dataset.find test_dataset_id
+ end
+
+ def training_dataset
+ Dataset.find training_dataset_id
+ end
+
+ end
+
+ class ClassificationTrainTest < TrainTest
+ include ClassificationStatistics
+ field :accept_values, type: Array
+ field :confusion_matrix, type: Array
+ field :weighted_confusion_matrix, type: Array
+ field :accuracy, type: Float
+ field :weighted_accuracy, type: Float
+ field :true_rate, type: Hash
+ field :predictivity, type: Hash
+ field :probability_plot_id, type: BSON::ObjectId
+ end
+
+ class RegressionTrainTest < TrainTest
+ include RegressionStatistics
+ field :rmse, type: Float, default:0
+ field :mae, type: Float, default:0
+ field :r_squared, type: Float
+ field :within_prediction_interval, type: Integer, default:0
+ field :out_of_prediction_interval, type: Integer, default:0
+ field :correlation_plot_id, type: BSON::ObjectId
+ end
+
+ end
+
+end
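
A hedged usage sketch for the new train/test API; model, training_set and test_set are assumed to be existing objects:

    validation = OpenTox::Validation::TrainTest.create model, training_set, test_set
    validation.nr_unpredicted   # test substances without a prediction
    validation.statistics       # accuracy or RMSE/MAE, depending on the subclass
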
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
new file mode 100644
index 0000000..b6f8a60
--- /dev/null
+++ b/lib/validation-statistics.rb
@@ -0,0 +1,225 @@
+module OpenTox
+ module Validation
+ module ClassificationStatistics
+
+ def statistics
+ self.accept_values = model.prediction_feature.accept_values
+ self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+ self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+ true_rate = {}
+ predictivity = {}
+ nr_instances = 0
+ predictions.each do |cid,pred|
+ # TODO
+ # use predictions without probabilities (single neighbor)??
+ # use measured majority class??
+ if pred[:measurements].uniq.size == 1 and pred[:probabilities]
+ m = pred[:measurements].first
+ if pred[:value] == m
+ if pred[:value] == accept_values[0]
+ confusion_matrix[0][0] += 1
+ weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ elsif pred[:value] == accept_values[1]
+ confusion_matrix[1][1] += 1
+ weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ end
+          else
+ if pred[:value] == accept_values[0]
+ confusion_matrix[0][1] += 1
+ weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ elsif pred[:value] == accept_values[1]
+ confusion_matrix[1][0] += 1
+ weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ end
+ end
+ end
+ end
+ accept_values.each_with_index do |v,i|
+ true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+ predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+ end
+ confidence_sum = 0
+ weighted_confusion_matrix.each do |r|
+ r.each do |c|
+ confidence_sum += c
+ end
+ end
+ self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
+ self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+ $logger.debug "Accuracy #{accuracy}"
+ save
+ {
+ :accept_values => accept_values,
+ :confusion_matrix => confusion_matrix,
+ :weighted_confusion_matrix => weighted_confusion_matrix,
+ :accuracy => accuracy,
+ :weighted_accuracy => weighted_accuracy,
+ :true_rate => true_rate,
+ :predictivity => predictivity,
+ }
+ end
+
+ def probability_plot format: "pdf"
+      unless probability_plot_id
+        tmpdir = "/tmp"
+        FileUtils.mkdir_p tmpdir
+ tmpfile = File.join(tmpdir,"#{id.to_s}_probability.#{format}")
+ accuracies = []
+ probabilities = []
+ correct_predictions = 0
+ incorrect_predictions = 0
+ pp = []
+ predictions.values.select{|p| p["probabilities"]}.compact.each do |p|
+ p["measurements"].each do |m|
+ pp << [ p["probabilities"][p["value"]], p["value"] == m ]
+ end
+ end
+ pp.sort_by!{|p| 1-p.first}
+ pp.each do |p|
+ p[1] ? correct_predictions += 1 : incorrect_predictions += 1
+ accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+ probabilities << p[0]
+ end
+ R.assign "accuracy", accuracies
+ R.assign "probability", probabilities
+ R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.#{format}")
+ plot_id = $gridfs.insert_one(file)
+ update(:probability_plot_id => plot_id)
+      end
+ $gridfs.find_one(_id: probability_plot_id).data
+ end
+ end
+
+ module RegressionStatistics
+
+ def statistics
+ self.rmse = 0
+ self.mae = 0
+ self.within_prediction_interval = 0
+ self.out_of_prediction_interval = 0
+ x = []
+ y = []
+ predictions.each do |cid,pred|
+ if pred[:value] and pred[:measurements]
+ x << pred[:measurements].median
+ y << pred[:value]
+ error = pred[:value]-pred[:measurements].median
+ self.rmse += error**2
+ self.mae += error.abs
+ if pred[:prediction_interval]
+ if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
+ self.within_prediction_interval += 1
+ else
+ self.out_of_prediction_interval += 1
+ end
+ end
+ else
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ end
+ end
+ R.assign "measurement", x
+ R.assign "prediction", y
+ R.eval "r <- cor(measurement,prediction,use='pairwise')"
+ self.r_squared = R.eval("r").to_ruby**2
+ self.mae = self.mae/predictions.size
+ self.rmse = Math.sqrt(self.rmse/predictions.size)
+ $logger.debug "R^2 #{r_squared}"
+ $logger.debug "RMSE #{rmse}"
+ $logger.debug "MAE #{mae}"
+ $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
+ save
+ {
+ :mae => mae,
+ :rmse => rmse,
+ :r_squared => r_squared,
+ :within_prediction_interval => within_prediction_interval,
+ :out_of_prediction_interval => out_of_prediction_interval,
+ }
+ end
+
+ def percent_within_prediction_interval
+ 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
+ end
+
+ def correlation_plot format: "png"
+ unless correlation_plot_id
+ tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
+ x = []
+ y = []
+ feature = Feature.find(predictions.first.last["prediction_feature_id"])
+ predictions.each do |sid,p|
+ x << p["measurements"].median
+ y << p["value"]
+ end
+ R.assign "measurement", x
+ R.assign "prediction", y
+ R.eval "all = c(measurement,prediction)"
+ R.eval "range = c(min(all), max(all))"
+ title = feature.name
+ title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
+ R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
+ R.eval "image = image + geom_abline(intercept=0, slope=1)"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
+ plot_id = $gridfs.insert_one(file)
+ update(:correlation_plot_id => plot_id)
+ end
+ $gridfs.find_one(_id: correlation_plot_id).data
+ end
+
+      def worst_predictions n: 5, show_neighbors: true, show_common_descriptors: false
+ worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
+ worst_predictions.collect do |p|
+ substance = Substance.find(p.first)
+ prediction = p[1]
+        if show_neighbors
+ neighbors = prediction["neighbors"].collect do |n|
+ common_descriptors = []
+ if show_common_descriptors
+ common_descriptors = n["common_descriptors"].collect do |d|
+                f = Feature.find(d[:id])
+ {
+ :id => f.id.to_s,
+ :name => "#{f.name} (#{f.conditions})",
+ :p_value => d[:p_value],
+ :r_squared => d[:r_squared],
+ }
+ end
+ else
+ common_descriptors = n["common_descriptors"].size
+ end
+ {
+ :name => Substance.find(n["_id"]).name,
+ :id => n["_id"].to_s,
+ :common_descriptors => common_descriptors
+ }
+ end
+ else
+ neighbors = prediction["neighbors"].size
+ end
+ {
+ :id => substance.id.to_s,
+ :name => substance.name,
+ :feature => Feature.find(prediction["prediction_feature_id"]).name,
+ :error => (prediction["value"] - prediction["measurements"].median).abs,
+ :prediction => prediction["value"],
+ :measurements => prediction["measurements"],
+ :neighbors => neighbors
+ }
+ end
+ end
+ end
+ end
+end
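
A worked check of the classification statistics above (matrix values assumed):

    # confusion_matrix = [[40, 10], [5, 45]] for accept_values ["active", "inactive"]
    # accuracy     = (40+45)/100.0   # => 0.85
    # true_rate    = {"active" => 40/50.0, "inactive" => 45/50.0}
    # predictivity = {"active" => 40/45.0, "inactive" => 45/55.0}
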
diff --git a/lib/validation.rb b/lib/validation.rb
index b72d273..ced9596 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -1,108 +1,25 @@
module OpenTox
- class Validation
-
- field :model_id, type: BSON::ObjectId
- field :prediction_dataset_id, type: BSON::ObjectId
- field :crossvalidation_id, type: BSON::ObjectId
- field :test_dataset_id, type: BSON::ObjectId
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :predictions, type: Array
-
- def prediction_dataset
- Dataset.find prediction_dataset_id
- end
-
- def test_dataset
- Dataset.find test_dataset_id
- end
-
- def model
- Model::Lazar.find model_id
- end
-
- def self.create model, training_set, test_set, crossvalidation=nil
-
- atts = model.attributes.dup # do not modify attributes from original model
- atts["_id"] = BSON::ObjectId.new
- atts[:training_dataset_id] = training_set.id
- validation_model = model.class.create training_set, atts
- validation_model.save
- cids = test_set.compound_ids
-
- test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
- prediction_dataset = validation_model.predict test_set_without_activities
- predictions = []
- nr_unpredicted = 0
- activities = test_set.data_entries.collect{|de| de.first}
- prediction_dataset.data_entries.each_with_index do |de,i|
- if de[0] #and de[1]
- cid = prediction_dataset.compound_ids[i]
- rows = cids.each_index.select{|r| cids[r] == cid }
- activities = rows.collect{|r| test_set.data_entries[r][0]}
- prediction = de.first
- confidence = de[1]
- predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
- else
- nr_unpredicted += 1
- end
+ module Validation
+
+ class Validation
+ include OpenTox
+ include Mongoid::Document
+ include Mongoid::Timestamps
+ store_in collection: "validations"
+ field :name, type: String
+ field :model_id, type: BSON::ObjectId
+ field :nr_instances, type: Integer, default: 0
+ field :nr_unpredicted, type: Integer, default: 0
+ field :predictions, type: Hash, default: {}
+ field :finished_at, type: Time
+
+ def model
+ Model::Lazar.find model_id
end
- validation = self.new(
- :model_id => validation_model.id,
- :prediction_dataset_id => prediction_dataset.id,
- :test_dataset_id => test_set.id,
- :nr_instances => test_set.compound_ids.size,
- :nr_unpredicted => nr_unpredicted,
- :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
- )
- validation.crossvalidation_id = crossvalidation.id if crossvalidation
- validation.save
- validation
- end
-
- end
-
- class ClassificationValidation < Validation
- end
- class RegressionValidation < Validation
-
- def statistics
- rmse = 0
- weighted_rmse = 0
- rse = 0
- weighted_rse = 0
- mae = 0
- weighted_mae = 0
- confidence_sum = 0
- predictions.each do |pred|
- compound_id,activity,prediction,confidence = pred
- if activity and prediction
- error = Math.log10(prediction)-Math.log10(activity.median)
- rmse += error**2
- weighted_rmse += confidence*error**2
- mae += error.abs
- weighted_mae += confidence*error.abs
- confidence_sum += confidence
- else
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- x = predictions.collect{|p| p[1].median}
- y = predictions.collect{|p| p[2]}
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
- r = R.eval("r").to_ruby
-
- mae = mae/predictions.size
- weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/predictions.size)
- weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
- { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
end
+
end
end