diff options
author | gebele <gebele@in-silico.ch> | 2016-12-12 09:15:48 +0000 |
---|---|---|
committer | gebele <gebele@in-silico.ch> | 2016-12-12 09:15:48 +0000 |
commit | da086fad5b45c0d7b59feb40d0108ac620613933 (patch) | |
tree | 7e9cf8c9332e30552ab255ee9b30e04e904977b4 /lib | |
parent | 32a16d99b51642cac8e75f90c43753d8d05ab770 (diff) | |
parent | 4570f11444bc10da88d849e9a2812e95a8933c8a (diff) |
merged development
Diffstat (limited to 'lib')
-rw-r--r-- | lib/algorithm.rb | 13 | ||||
-rw-r--r-- | lib/caret.rb | 96 | ||||
-rw-r--r-- | lib/classification.rb | 36 | ||||
-rw-r--r-- | lib/compound.rb | 104 | ||||
-rw-r--r-- | lib/crossvalidation.rb | 388 | ||||
-rw-r--r-- | lib/dataset.rb | 279 | ||||
-rw-r--r-- | lib/feature.rb | 29 | ||||
-rw-r--r-- | lib/feature_selection.rb | 42 | ||||
-rw-r--r-- | lib/import.rb | 122 | ||||
-rw-r--r-- | lib/lazar.rb | 18 | ||||
-rw-r--r-- | lib/leave-one-out-validation.rb | 238 | ||||
-rw-r--r-- | lib/model.rb | 449 | ||||
-rw-r--r-- | lib/nanoparticle.rb | 92 | ||||
-rw-r--r-- | lib/opentox.rb | 11 | ||||
-rw-r--r-- | lib/overwrite.rb | 27 | ||||
-rw-r--r-- | lib/physchem.rb | 21 | ||||
-rw-r--r-- | lib/regression.rb | 144 | ||||
-rw-r--r-- | lib/rest-client-wrapper.rb | 6 | ||||
-rw-r--r-- | lib/similarity.rb | 65 | ||||
-rw-r--r-- | lib/substance.rb | 8 | ||||
-rw-r--r-- | lib/train-test-validation.rb | 69 | ||||
-rw-r--r-- | lib/validation-statistics.rb | 225 | ||||
-rw-r--r-- | lib/validation.rb | 117 |
23 files changed, 1428 insertions, 1171 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 113f847..0e4b93a 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -2,18 +2,9 @@ module OpenTox module Algorithm - # Generic method to execute algorithms - # Algorithms should: - # - accept a Compound, an Array of Compounds or a Dataset as first argument - # - optional parameters as second argument - # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values - # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object - # @param [Hash] Algorithm parameters - # @return Algorithm result - def self.run algorithm, object, parameters=nil - bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/ + def self.run algorithm, parameters=nil klass,method = algorithm.split('.') - parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters) + Object.const_get(klass).send(method,parameters) end end diff --git a/lib/caret.rb b/lib/caret.rb new file mode 100644 index 0000000..7e4f771 --- /dev/null +++ b/lib/caret.rb @@ -0,0 +1,96 @@ +module OpenTox + module Algorithm + + class Caret + # model list: https://topepo.github.io/caret/modelList.html + + def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables: + remove = [] + # remove independent_variables with single values + independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1} + remove.sort.reverse.each do |i| + independent_variables.delete_at i + query_variables.delete_at i + end + if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + elsif + dependent_variables.size < 3 + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights + prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances." + + else + dependent_variables.each_with_index do |v,i| + dependent_variables[i] = to_r(v) + end + independent_variables.each_with_index do |c,i| + c.each_with_index do |v,j| + independent_variables[i][j] = to_r(v) + end + end + query_variables.each_with_index do |v,i| + query_variables[i] = to_r(v) + end + begin + R.assign "weights", weights + r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", (0..independent_variables.size-1).to_a + R.eval "names(data) <- append(c('activities'),features)" # + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" + rescue => e + $logger.debug "R caret model creation error for:" + $logger.debug dependent_variables + $logger.debug independent_variables + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights + prediction[:warning] = "R caret model creation error. Using weighted average of similar substances." + return prediction + end + begin + R.eval "query <- data.frame(rbind(c(#{query_variables.join ','})))" + R.eval "names(query) <- features" + R.eval "prediction <- predict(model,query)" + value = R.eval("prediction").to_f + rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f + r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f + prediction_interval = value-1.96*rmse, value+1.96*rmse + prediction = { + :value => value, + :rmse => rmse, + :r_squared => r_squared, + :prediction_interval => prediction_interval + } + rescue => e + $logger.debug "R caret prediction error for:" + $logger.debug self.inspect + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights + prediction[:warning] = "R caret prediction error. Using weighted average of similar substances" + return prediction + end + if prediction.nil? or prediction[:value].nil? + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights + prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." + end + end + prediction + + end + + # call caret methods dynamically, e.g. Caret.pls + def self.method_missing(sym, *args, &block) + args.first[:method] = sym.to_s + self.create_model_and_predict args.first + end + + def self.to_r v + return "F" if v == false + return "T" if v == true + return nil if v.is_a? Float and v.nan? + v + end + + end + end +end + diff --git a/lib/classification.rb b/lib/classification.rb index 0202940..e8c179f 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,32 +3,24 @@ module OpenTox class Classification - def self.weighted_majority_vote compound, params - neighbors = params[:neighbors] - weighted_sum = {} - sim_sum = 0.0 - confidence = 0.0 - neighbors.each do |row| - sim = row["tanimoto"] - row["features"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum[act] ||= 0 - weighted_sum[act] += sim - end + def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables: + class_weights = {} + dependent_variables.each_with_index do |v,i| + class_weights[v] ||= [] + class_weights[v] << weights[i] unless v.nil? end - case weighted_sum.size - when 1 - return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs} - when 2 - sim_sum = weighted_sum[weighted_sum.keys[0]] - sim_sum -= weighted_sum[weighted_sum.keys[1]] - sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1] - confidence = (sim_sum/neighbors.size).abs - return {:value => prediction,:confidence => confidence} - else - bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'" + probabilities = {} + class_weights.each do |a,w| + probabilities[a] = w.sum/weights.sum end + probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h + p_max = probabilities.collect{|a,p| p}.max + prediction = probabilities.key(p_max) + {:value => prediction,:probabilities => probabilities} end + end + end end diff --git a/lib/compound.rb b/lib/compound.rb index e002305..8a1143b 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,11 +1,9 @@ -CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" +CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/" module OpenTox - class Compound + class Compound < Substance require_relative "unique_descriptors.rb" - include OpenTox - DEFAULT_FINGERPRINT = "MP2D" field :inchi, type: String @@ -19,9 +17,6 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer - field :physchem_descriptors, type: Hash, default: {} - field :dataset_ids, type: Array, default: [] - field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) @@ -80,9 +75,8 @@ module OpenTox fingerprints[type] end - def physchem descriptors=PhysChem.openbabel_descriptors - # TODO: speedup java descriptors - calculated_ids = physchem_descriptors.keys + def calculate_properties descriptors=PhysChem::OPENBABEL + calculated_ids = properties.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids descs = {} @@ -95,11 +89,11 @@ module OpenTox # avoid recalculating Cdk features with multiple values descs.keys.uniq.each do |k| descs[k].send(k[0].downcase,k[1],self).each do |n,v| - physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. end end save - physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + descriptors.collect{|d| properties[d.id.to_s]} end def smarts_match smarts, count=false @@ -142,9 +136,6 @@ module OpenTox # @param inchi [String] smiles InChI string # @return [OpenTox::Compound] Compound def self.from_inchi inchi - # Temporary workaround for OpenBabels Inchi bug - # http://sourceforge.net/p/openbabel/bugs/957/ - # bug has not been fixed in latest git/development version #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") if smiles.empty? @@ -246,7 +237,7 @@ module OpenTox # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem def cid - pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/" + pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] self["cid"] end @@ -254,70 +245,13 @@ module OpenTox # @return [String] ChEMBL database compound id, derieved via restcall to chembl def chemblid # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey - uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" + uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"] self["chemblid"] end - def fingerprint_count_neighbors params - # TODO fix - neighbors = [] - query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| - unless self == compound - candidate_fingerprint = compound.fingerprint params[:type] - features = (query_fingerprint + candidate_fingerprint).uniq - min_sum = 0 - max_sum = 0 - features.each do |f| - min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax - min_sum += min - max_sum += max - end - max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f - neighbors << [compound.id, sim] if sim and sim >= params[:min_sim] - end - end - neighbors.sort{|a,b| b.last <=> a.last} - end - - def fingerprint_neighbors params - bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim] - neighbors = [] - if params[:type] == DEFAULT_FINGERPRINT - neighbors = db_neighbors params - else - query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]) - prediction_feature = training_dataset.features.first - training_dataset.compounds.each do |compound| - candidate_fingerprint = compound.fingerprint params[:type] - sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - feature_values = training_dataset.values(compound,prediction_feature) - neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] - end - neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} - end - neighbors - end - - def physchem_neighbors params - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] - neighbors = [] - feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| - # TODO implement pearson and cosine similarity separatly - R.assign "x", query_fingerprint - R.assign "y", candidate_fingerprint - sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first - if sim >= params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming - end - end - neighbors - end - - def db_neighbors params + def db_neighbors min_sim: 0.1, dataset_id: + #p fingerprints[DEFAULT_FINGERPRINT] # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb #qn = default_fingerprint_size @@ -329,20 +263,20 @@ module OpenTox #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self {'$project' => { - 'tanimoto' => {'$let' => { + 'similarity' => {'$let' => { 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, - #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}}, 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, '_id' => 1, - 'features' => 1, + #'measurements' => 1, 'dataset_ids' => 1 }}, - {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, - {'$sort' => {'tanimoto' => -1}} + {'$match' => {'similarity' => {'$gte' => min_sim}}}, + {'$sort' => {'similarity' => -1}} ] - - $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} + + # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array + $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id} end @@ -353,7 +287,7 @@ module OpenTox end # Convert mg to mmol - # @return [Float] value in mg + # @return [Float] value in mmol def mg_to_mmol mg mg.to_f/molecular_weight end @@ -362,7 +296,7 @@ module OpenTox # @return [Float] molecular weight def molecular_weight mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") - physchem([mw_feature])[mw_feature.id.to_s] + calculate_properties([mw_feature]).first end private diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15dfb21..5a05955 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -1,301 +1,135 @@ module OpenTox - class CrossValidation - field :validation_ids, type: Array, default: [] - field :model_id, type: BSON::ObjectId - field :folds, type: Integer - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Array, default: [] - field :finished_at, type: Time - - def time - finished_at - created_at - end - - def validations - validation_ids.collect{|vid| Validation.find vid} - end - - def model - Model::Lazar.find model_id - end - - def self.create model, n=10 - model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation - bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass - cv = klass.new( - name: model.name, - model_id: model.id, - folds: n - ) - cv.save # set created_at - nr_instances = 0 - nr_unpredicted = 0 - predictions = [] - training_dataset = Dataset.find model.training_dataset_id - training_dataset.folds(n).each_with_index do |fold,fold_nr| - #fork do # parallel execution of validations - $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" - t = Time.now - validation = Validation.create(model, fold[0], fold[1],cv) - $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - #end - end - #Process.waitall - cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) - cv.validations.each do |validation| - nr_instances += validation.nr_instances - nr_unpredicted += validation.nr_unpredicted - predictions += validation.predictions - end - cv.update_attributes( - nr_instances: nr_instances, - nr_unpredicted: nr_unpredicted, - predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence - ) - $logger.debug "Nr unpredicted: #{nr_unpredicted}" - cv.statistics - cv - end - end - - class ClassificationCrossValidation < CrossValidation - - field :accept_values, type: Array - field :confusion_matrix, type: Array - field :weighted_confusion_matrix, type: Array - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash - field :predictivity, type: Hash - field :confidence_plot_id, type: BSON::ObjectId - # TODO auc, f-measure (usability??) - - def statistics - accept_values = Feature.find(model.prediction_feature_id).accept_values - confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - true_rate = {} - predictivity = {} - predictions.each do |pred| - compound_id,activities,prediction,confidence = pred - if activities and prediction #and confidence.numeric? - if activities.uniq.size == 1 - activity = activities.uniq.first - if prediction == activity - if prediction == accept_values[0] - confusion_matrix[0][0] += 1 - #weighted_confusion_matrix[0][0] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][1] += 1 - #weighted_confusion_matrix[1][1] += confidence - end - elsif prediction != activity - if prediction == accept_values[0] - confusion_matrix[0][1] += 1 - #weighted_confusion_matrix[0][1] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][0] += 1 - #weighted_confusion_matrix[1][0] += confidence - end - end - end - else - nr_unpredicted += 1 if prediction.nil? + module Validation + class CrossValidation < Validation + field :validation_ids, type: Array, default: [] + field :folds, type: Integer, default: 10 + + def self.create model, n=10 + $logger.debug model.algorithms + klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification + klass = RegressionCrossValidation if model.is_a? Model::LazarRegression + bad_request_error "Unknown model class #{model.class}." unless klass + + cv = klass.new( + name: model.name, + model_id: model.id, + folds: n + ) + cv.save # set created_at + + nr_instances = 0 + nr_unpredicted = 0 + training_dataset = model.training_dataset + training_dataset.folds(n).each_with_index do |fold,fold_nr| + #fork do # parallel execution of validations can lead to Rserve and memory problems + $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" + t = Time.now + validation = TrainTest.create(model, fold[0], fold[1]) + cv.validation_ids << validation.id + cv.nr_instances += validation.nr_instances + cv.nr_unpredicted += validation.nr_unpredicted + #cv.predictions.merge! validation.predictions + $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" + #end end + #Process.waitall + cv.save + $logger.debug "Nr unpredicted: #{nr_unpredicted}" + cv.statistics + cv.update_attributes(finished_at: Time.now) + cv end - true_rate = {} - predictivity = {} - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + + def time + finished_at - created_at end - confidence_sum = 0 - #weighted_confusion_matrix.each do |r| - #r.each do |c| - #confidence_sum += c - #end - #end - update_attributes( - accept_values: accept_values, - confusion_matrix: confusion_matrix, - #weighted_confusion_matrix: weighted_confusion_matrix, - accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, - true_rate: true_rate, - predictivity: predictivity, - finished_at: Time.now - ) - $logger.debug "Accuracy #{accuracy}" - end - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.png" - accuracies = [] - confidences = [] - correct_predictions = 0 - incorrect_predictions = 0 - predictions.each do |p| - if p[1] and p[2] - p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[3] + def validations + validation_ids.collect{|vid| TrainTest.find vid} + end - end - end - R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) + def predictions + predictions = {} + validations.each{|v| predictions.merge!(v.predictions)} + predictions end - $gridfs.find_one(_id: confidence_plot_id).data end - #Average area under roc 0.646 - #Area under roc 0.646 - #F measure carcinogen: 0.769, noncarcinogen: 0.348 - end + class ClassificationCrossValidation < CrossValidation + include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array + field :weighted_confusion_matrix, type: Array + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash + field :predictivity, type: Hash + field :probability_plot_id, type: BSON::ObjectId + end - class RegressionCrossValidation < CrossValidation + class RegressionCrossValidation < CrossValidation + include RegressionStatistics + field :rmse, type: Float, default:0 + field :mae, type: Float, default:0 + field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 + field :correlation_plot_id, type: BSON::ObjectId + end - field :rmse, type: Float - field :mae, type: Float - field :r_squared, type: Float - field :correlation_plot_id, type: BSON::ObjectId + class RepeatedCrossValidation < Validation + field :crossvalidation_ids, type: Array, default: [] + field :correlation_plot_id, type: BSON::ObjectId - def statistics - rmse = 0 - mae = 0 - x = [] - y = [] - predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction - unless activity == [nil] - x << -Math.log10(activity.median) - y << -Math.log10(prediction) - error = Math.log10(prediction)-Math.log10(activity.median) - rmse += error**2 - #weighted_rmse += confidence*error**2 - mae += error.abs - #weighted_mae += confidence*error.abs - #confidence_sum += confidence - end - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + def self.create model, folds=10, repeats=3 + repeated_cross_validation = self.new + repeats.times do |n| + $logger.debug "Crossvalidation #{n+1} for #{model.name}" + repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id end + repeated_cross_validation.save + repeated_cross_validation end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='complete')" - r = R.eval("r").to_ruby - mae = mae/predictions.size - #weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/predictions.size) - #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - update_attributes( - mae: mae, - rmse: rmse, - #weighted_mae: weighted_mae, - #weighted_rmse: weighted_rmse, - r_squared: r**2, - finished_at: Time.now - ) - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" - end + def crossvalidations + crossvalidation_ids.collect{|id| CrossValidation.find(id)} + end - def misclassifications n=nil - #n = predictions.size unless n - n ||= 10 - model = Model::Lazar.find(self.model_id) - training_dataset = Dataset.find(model.training_dataset_id) - prediction_feature = training_dataset.features.first - predictions.collect do |p| - unless p.include? nil - compound = Compound.find(p[0]) - neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) - neighbors.collect! do |n| - neighbor = Compound.find(n[0]) - values = training_dataset.values(neighbor,prediction_feature) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values} + def correlation_plot format: "png" + #unless correlation_plot_id + feature = Feature.find(crossvalidations.first.model.prediction_feature) + title = feature.name + title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank? + tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + images = [] + crossvalidations.each_with_index do |cv,i| + x = [] + y = [] + cv.predictions.each do |sid,p| + x << p["measurements"].median + y << p["value"] + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(measurement,prediction)" + R.eval "range = c(min(all), max(all))" + R.eval "image#{i} = qplot(prediction,measurement,main='#{title} #{i}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image#{i} = image#{i} + geom_abline(intercept=0, slope=1)" + images << "image#{i}" + + R.eval "ggsave(file='/home/ist/lazar/test/tmp#{i}.pdf', plot=image#{i})" end - { - :smiles => compound.smiles, - #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, - :measured => p[1], - :predicted => p[2], - #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, - :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs, - :relative_error => (p[1]-p[2]).abs/p[1], - :confidence => p[3], - :neighbors => neighbors - } - end - end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1] - end - - def confidence_plot - tmpfile = "/tmp/#{id.to_s}_confidence.png" - sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact - R.assign "error", sorted_predictions.collect{|p| p[0]} - R.assign "confidence", sorted_predictions.collect{|p| p[1]} - # TODO fix axis names - R.eval "image = qplot(confidence,error)" - R.eval "image = image + stat_smooth(method='lm', se=FALSE)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - $gridfs.find_one(_id: confidence_plot_id).data - end - - def correlation_plot - unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.png" - x = predictions.collect{|p| p[1]} - y = predictions.collect{|p| p[2]} - attributes = Model::Lazar.find(self.model_id).attributes - attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} - attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") - R.assign "measurement", x - R.assign "prediction", y - R.eval "all = c(-log(measurement),-log(prediction))" - R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") - plot_id = $gridfs.insert_one(file) - update(:correlation_plot_id => plot_id) - end + R.eval "pdf('#{tmpfile}')" + R.eval "grid.arrange(#{images.join ","},ncol=#{images.size})" + R.eval "dev.off()" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") + correlation_plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => correlation_plot_id) + #end $gridfs.find_one(_id: correlation_plot_id).data - end - end - - class RepeatedCrossValidation - field :crossvalidation_ids, type: Array, default: [] - def self.create model, folds=10, repeats=3 - repeated_cross_validation = self.new - repeats.times do |n| - $logger.debug "Crossvalidation #{n+1} for #{model.name}" - repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id end - repeated_cross_validation.save - repeated_cross_validation - end - def crossvalidations - crossvalidation_ids.collect{|id| CrossValidation.find(id)} end end - end diff --git a/lib/dataset.rb b/lib/dataset.rb index 5d8aeaf..ab55294 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,46 +5,49 @@ module OpenTox class Dataset - # associations like has_many, belongs_to deteriorate performance - field :feature_ids, type: Array, default: [] - field :compound_ids, type: Array, default: [] - field :data_entries, type: Array, default: [] - field :source, type: String + field :data_entries, type: Hash, default: {} # Readers - # Get all compounds def compounds - @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id} - @compounds + substances.select{|s| s.is_a? Compound} + end + + def nanoparticles + substances.select{|s| s.is_a? Nanoparticle} + end + + # Get all substances + def substances + @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq + @substances end # Get all features def features - @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq @features end - # Find data entry values for a given compound and feature - # @param compound [OpenTox::Compound] OpenTox Compound object - # @param feature [OpenTox::Feature] OpenTox Feature object - # @return [Array] Data entry values - def values(compound, feature) - rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id } - col = feature_ids.index feature.id - rows.collect{|row| data_entries[row][col]} + def values substance,feature + substance = substance.id if substance.is_a? Substance + feature = feature.id if feature.is_a? Feature + if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s] + data_entries[substance.to_s][feature.to_s] + else + nil + end end # Writers - # Set compounds - def compounds=(compounds) - self.compound_ids = compounds.collect{|c| c.id} - end - - # Set features - def features=(features) - self.feature_ids = features.collect{|f| f.id} + def add(substance,feature,value) + substance = substance.id if substance.is_a? Substance + feature = feature.id if feature.is_a? Feature + data_entries[substance.to_s] ||= {} + data_entries[substance.to_s][feature.to_s] ||= [] + data_entries[substance.to_s][feature.to_s] << value + #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source end # Dataset operations @@ -53,13 +56,7 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - unique_compound_data = {} - compound_ids.each_with_index do |cid,i| - unique_compound_data[cid] ||= [] - unique_compound_data[cid] << data_entries[i] - end - unique_compound_ids = unique_compound_data.keys - len = unique_compound_ids.size + len = self.substances.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -68,22 +65,16 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_cids = test_idxs.collect{|i| unique_compound_ids[i]} + test_substances = test_idxs.collect{|i| substances[i]} training_idxs = indices-test_idxs - training_cids = training_idxs.collect{|i| unique_compound_ids[i]} - chunk = [training_cids,test_cids].collect do |unique_cids| - cids = [] - data_entries = [] - unique_cids.each do |cid| - unique_compound_data[cid].each do |de| - cids << cid - data_entries << de - end - end - dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id ) - dataset.compounds.each do |compound| - compound.dataset_ids << dataset.id - compound.save + training_substances = training_idxs.collect{|i| substances[i]} + chunk = [training_substances,test_substances].collect do |substances| + dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) + substances.each do |substance| + substance.dataset_ids << dataset.id + substance.dataset_ids.uniq! + substance.save + dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} end dataset.save dataset @@ -94,41 +85,37 @@ module OpenTox chunks end - # Diagnostics - - def duplicates feature=self.features.first - col = feature_ids.index feature.id - dups = {} - compound_ids.each_with_index do |cid,i| - rows = compound_ids.each_index.select{|r| compound_ids[r] == cid } - values = rows.collect{|row| data_entries[row][col]} - dups[cid] = values if values.size > 1 - end - dups - end - - def correlation_plot training_dataset - # TODO: create/store svg - R.assign "features", data_entries - R.assign "activities", training_dataset.data_entries.collect{|de| de.first} - R.eval "featurePlot(features,activities)" - end - - def density_plot - # TODO: create/store svg - R.assign "acts", data_entries.collect{|r| r.first }#.compact - R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')" - end - # Serialisation # converts dataset to csv format including compound smiles as first column, other column headers are feature names # @return [String] def to_csv(inchi=false) - CSV.generate() do |csv| #{:force_quotes=>true} - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} - compounds.each_with_index do |c,i| - csv << [inchi ? c.inchi : c.smiles] + data_entries[i] + CSV.generate() do |csv| + compound = substances.first.is_a? Compound + if compound + csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + else + csv << ["Name"] + features.collect{|f| f.name} + end + substances.each do |substance| + if compound + name = (inchi ? substance.inchi : substance.smiles) + else + name = substance.name + end + nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq + + if nr_measurements.size > 1 + warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." + else + (0..nr_measurements.first-1).each do |i| + row = [name] + features.each do |f| + values(substance,f) ? row << values(substance,f)[i] : row << "" + end + csv << row + end + end end end end @@ -143,9 +130,8 @@ module OpenTox #end # Create a dataset from CSV file - # TODO: document structure - def self.from_csv_file file, source=nil, bioassay=true#, layout={} - source ||= file + def self.from_csv_file file, accept_empty_values=false + source = file name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) if dataset @@ -154,171 +140,116 @@ module OpenTox $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' dataset = self.new(:source => source, :name => name) - dataset.parse_table table, bioassay#, layout + dataset.parse_table table, accept_empty_values end dataset end # parse data in tabular format (e.g. from csv) # does a lot of guesswork in order to determine feature types - def parse_table table, bioassay=true - - time = Time.now + def parse_table table, accept_empty_values # features feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size + warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size compound_format = feature_names.shift.strip bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i - numeric = [] + features = [] # guess feature types feature_names.each_with_index do |f,i| metadata = {:name => f} values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq + feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - metadata["numeric"] = true numeric[i] = true + feature = NumericFeature.find_or_create_by(metadata) else - metadata["nominal"] = true metadata["accept_values"] = values numeric[i] = false + feature = NominalFeature.find_or_create_by(metadata) end - if bioassay - if metadata["numeric"] - feature = NumericBioAssay.find_or_create_by(metadata) - elsif metadata["nominal"] - feature = NominalBioAssay.find_or_create_by(metadata) - end - else - metadata.merge({:measured => false, :calculated => true}) - if metadata["numeric"] - feature = NumericFeature.find_or_create_by(metadata) - elsif metadata["nominal"] - feature = NominalFeature.find_or_create_by(metadata) - end - end - feature_ids << feature.id if feature + features << feature if feature end - $logger.debug "Feature values: #{Time.now-time}" - time = Time.now - - r = -1 - compound_time = 0 - value_time = 0 - - # compounds and values - self.data_entries = [] + # substances and values + all_substances = [] table.each_with_index do |vals,i| - ct = Time.now identifier = vals.shift.strip - warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty? + warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values begin case compound_format when /SMILES/i - compound = OpenTox::Compound.from_smiles(identifier) + substance = OpenTox::Compound.from_smiles(identifier) when /InChI/i - compound = OpenTox::Compound.from_inchi(identifier) + substance = OpenTox::Compound.from_inchi(identifier) end rescue - compound = nil + substance = nil end - if compound.nil? - # compound parsers may return nil - warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." + if substance.nil? # compound parsers may return nil + warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." next end - compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id - compound_time += Time.now-ct + all_substances << substance + substance.dataset_ids << self.id + substance.dataset_ids.uniq! + substance.save - r += 1 - unless vals.size == feature_ids.size # way cheaper than accessing features - warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." + unless vals.size == features.size + warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next end - compound_ids << compound.id - table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1) - vals.each_with_index do |v,j| if v.blank? - warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." + warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." next elsif numeric[j] v = v.to_f else v = v.strip end - self.data_entries.last[j] = v - #i = compound.feature_ids.index feature_ids[j] - compound.features[feature_ids[j].to_s] ||= [] - compound.features[feature_ids[j].to_s] << v - compound.save + add substance, features[j], v end + data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values end - compounds.duplicates.each do |compound| + all_substances.duplicates.each do |substance| positions = [] - compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} - warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} + warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end - - $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" - time = Time.now save - $logger.debug "Saving: #{Time.now-time}" - end - # Fill unset data entries - # @param any value - def fill_nil_with n - (0 .. compound_ids.size-1).each do |i| - data_entries[i] ||= [] - (0 .. feature_ids.size-1).each do |j| - data_entries[i][j] ||= n - end - end + def delete + compounds.each{|c| c.dataset_ids.delete id.to_s} + super end end # Dataset for lazar predictions - class LazarPrediction < Dataset + class LazarPrediction #< Dataset field :creator, type: String - field :prediction_feature_id, type: String + field :prediction_feature_id, type: BSON::ObjectId + field :predictions, type: Hash, default: {} def prediction_feature Feature.find prediction_feature_id end - end - - # Dataset for descriptors (physchem) - class DescriptorDataset < Dataset - field :feature_calculation_algorithm, type: String - - end - - class ScaledDataset < DescriptorDataset - - field :centers, type: Array, default: [] - field :scales, type: Array, default: [] + def compounds + substances.select{|s| s.is_a? Compound} + end - def original_value value, i - value * scales[i] + centers[i] + def substances + predictions.keys.collect{|id| Substance.find id} end - end - # Dataset for fminer descriptors - class FminerDataset < DescriptorDataset - field :training_algorithm, type: String - field :training_dataset_id, type: BSON::ObjectId - field :training_feature_id, type: BSON::ObjectId - field :training_parameters, type: Hash end end diff --git a/lib/feature.rb b/lib/feature.rb index b58946b..0ca4d41 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -2,27 +2,28 @@ module OpenTox # Basic feature class class Feature - field :nominal, type: Boolean - field :numeric, type: Boolean field :measured, type: Boolean field :calculated, type: Boolean + field :category, type: String + field :unit, type: String + field :conditions, type: Hash + + def nominal? + self.class == NominalFeature + end + + def numeric? + self.class == NumericFeature + end end # Feature for categorical variables class NominalFeature < Feature field :accept_values, type: Array - def initialize params - super params - nominal = true - end end # Feature for quantitative variables class NumericFeature < Feature - def initialize params - super params - numeric = true - end end # Feature for SMARTS fragments @@ -34,12 +35,4 @@ module OpenTox end end - # Feature for categorical bioassay results - class NominalBioAssay < NominalFeature - end - - # Feature for quantitative bioassay results - class NumericBioAssay < NumericFeature - end - end diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb new file mode 100644 index 0000000..65f9752 --- /dev/null +++ b/lib/feature_selection.rb @@ -0,0 +1,42 @@ +module OpenTox + module Algorithm + + class FeatureSelection + + def self.correlation_filter model + relevant_features = {} + R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)} + model.descriptor_weights = [] + selected_variables = [] + selected_descriptor_ids = [] + model.independent_variables.each_with_index do |v,i| + v.collect!{|n| to_r(n)} + R.assign "independent", v + begin + R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + model.descriptor_weights << R.eval("cor$estimate").to_ruby**2 + selected_variables << v + selected_descriptor_ids << model.descriptor_ids[i] + end + rescue + warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed." + end + end + + model.independent_variables = selected_variables + model.descriptor_ids = selected_descriptor_ids + model + end + + def self.to_r v + return 0 if v == false + return 1 if v == true + v + end + + end + + end +end diff --git a/lib/import.rb b/lib/import.rb new file mode 100644 index 0000000..aa2ee75 --- /dev/null +++ b/lib/import.rb @@ -0,0 +1,122 @@ +module OpenTox + + module Import + + class Enanomapper + include OpenTox + + # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) + def self.import dir="." + datasets = {} + bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + bundles.each do |bundle| + datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) + $logger.debug bundle["title"] + nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] + nanoparticles.each_with_index do |np,n| + core_id = nil + coating_ids = [] + np["composition"].each do |c| + uri = c["component"]["compound"]["URI"] + uri = CGI.escape File.join(uri,"&media=application/json") + data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}") + smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] + names = [] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] + if smiles + compound = Compound.find_or_create_by(:smiles => smiles) + compound.name = names.first + compound.names = names.compact + else + compound = Compound.find_or_create_by(:name => names.first,:names => names) + end + compound.save + if c["relation"] == "HAS_CORE" + core_id = compound.id.to_s + elsif c["relation"] == "HAS_COATING" + coating_ids << compound.id.to_s + end + end if np["composition"] + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + :core_id => core_id, + :coating_ids => coating_ids + ) + np["bundles"].keys.each do |bundle_uri| + nanoparticle.dataset_ids << datasets[bundle_uri].id + end + + studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"] + studies.each do |study| + dataset = datasets[np["bundles"].keys.first] + proteomics_features = {} + category = study["protocol"]["topcategory"] + source = study["protocol"]["category"]["term"] + study["effects"].each do |effect| + + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature + effect["conditions"].delete_if { |k, v| v.nil? } + + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data + + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) + nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset + end + else + name = effect["endpoint"] + unit = effect["result"]["unit"] + warnings = [] + case name + when "Log2 transformed" # use a sensible name + name = "log2(Net cell association)" + warnings = ["Original name was 'Log2 transformed'"] + unit = "log2(mL/ug(Mg))" + when "Total protein (BCA assay)" + category = "P-CHEM" + warnings = ["Category changed from TOX to P-CHEM"] + end + feature = klass.find_or_create_by( + :name => name, + :unit => unit, + :category => category, + :conditions => effect["conditions"], + :source => study["protocol"]["category"]["term"], + :measured => true, + :warnings => warnings + ) + nanoparticle.parse_ambit_value feature, effect["result"], dataset + end + end + end + nanoparticle.save + print "#{n}, " + end + end + datasets.each { |u,d| d.save } + end + +=begin + def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries + #get list of bundle URIs + bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + datasets = [] + bundles.each do |bundle| + uri = bundle["URI"] + study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`) + study["@graph"].each do |i| + puts i.to_yaml if i.keys.include? "sio:has-value" + end + end + datasets.collect{|d| d.id} + end +=end + + end + + end + +end + diff --git a/lib/lazar.rb b/lib/lazar.rb index a28ba3a..f251379 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -48,6 +48,7 @@ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i R = Rserve::Connection.new R.eval " suppressPackageStartupMessages({ + library(labeling,lib=\"#{rlib}\") library(iterators,lib=\"#{rlib}\") library(foreach,lib=\"#{rlib}\") library(ggplot2,lib=\"#{rlib}\") @@ -56,12 +57,14 @@ suppressPackageStartupMessages({ library(pls,lib=\"#{rlib}\") library(caret,lib=\"#{rlib}\") library(doMC,lib=\"#{rlib}\") + library(randomForest,lib=\"#{rlib}\") + library(plyr,lib=\"#{rlib}\") registerDoMC(#{NR_CORES}) }) " # OpenTox classes and includes -CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", @@ -70,15 +73,22 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO "opentox.rb", "feature.rb", "physchem.rb", + "substance.rb", "compound.rb", + "nanoparticle.rb", "dataset.rb", "algorithm.rb", + "similarity.rb", + "feature_selection.rb", "model.rb", "classification.rb", "regression.rb", + "caret.rb", + "validation-statistics.rb", "validation.rb", - "crossvalidation.rb", + "train-test-validation.rb", "leave-one-out-validation.rb", - "experiment.rb", + "crossvalidation.rb", + #"experiment.rb", + "import.rb", ].each{ |f| require_relative f } -OpenTox::PhysChem.descriptors # load descriptor features diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 2cd13db..538b7b3 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -1,205 +1,59 @@ module OpenTox - class LeaveOneOutValidation - - field :model_id, type: BSON::ObjectId - field :dataset_id, type: BSON::ObjectId - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Array - field :finished_at, type: Time - - def self.create model - model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation - loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id - compound_ids = model.training_dataset.compound_ids - predictions = model.predict model.training_dataset.compounds - predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]} - predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?} - loo.nr_instances = predictions.size - predictions.select!{|p| p[:value]} # remove unpredicted - loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]} - loo.nr_unpredicted = loo.nr_instances - loo.predictions.size - loo.statistics - loo.save - loo - end - - def model - Model::Lazar.find model_id - end - end - - class ClassificationLeaveOneOutValidation < LeaveOneOutValidation - - field :accept_values, type: Array - field :confusion_matrix, type: Array, default: [] - field :weighted_confusion_matrix, type: Array, default: [] - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash, default: {} - field :predictivity, type: Hash, default: {} - field :confidence_plot_id, type: BSON::ObjectId - - def statistics - accept_values = Feature.find(model.prediction_feature_id).accept_values - confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - predictions.each do |pred| - pred[:database_activities].each do |db_act| - if pred[:value] - if pred[:value] == db_act - if pred[:value] == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:confidence] - elsif pred[:value] == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:confidence] - end - else - if pred[:value] == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += pred[:confidence] - elsif pred[:value] == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:confidence] - end - end + module Validation + + class LeaveOneOut < Validation + + def self.create model + bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection] + $logger.debug "#{model.name}: LOO validation started" + t = Time.now + model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut + loo = klass.new :model_id => model.id + predictions = model.predict model.training_dataset.substances + predictions.each{|cid,p| p.delete(:neighbors)} + nr_unpredicted = 0 + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) + else + nr_unpredicted += 1 end + predictions.delete(cid) unless prediction[:value] and prediction[:measurements] end + predictions.select!{|cid,p| p[:value] and p[:measurements]} + loo.nr_instances = predictions.size + loo.nr_unpredicted = nr_unpredicted + loo.predictions = predictions + loo.statistics + $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" + loo end - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c - end - end - update_attributes( - accept_values: accept_values, - confusion_matrix: confusion_matrix, - weighted_confusion_matrix: weighted_confusion_matrix, - accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, - true_rate: true_rate, - predictivity: predictivity, - finished_at: Time.now - ) - $logger.debug "Accuracy #{accuracy}" - end - - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.svg" - accuracies = [] - confidences = [] - correct_predictions = 0 - incorrect_predictions = 0 - predictions.each do |p| - p[:database_activities].each do |db_act| - if p[:value] - p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[:confidence] - end - end - end - R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - end - $gridfs.find_one(_id: confidence_plot_id).data end - end - - - class RegressionLeaveOneOutValidation < LeaveOneOutValidation - - field :rmse, type: Float, default: 0.0 - field :mae, type: Float, default: 0 - #field :weighted_rmse, type: Float, default: 0 - #field :weighted_mae, type: Float, default: 0 - field :r_squared, type: Float - field :correlation_plot_id, type: BSON::ObjectId - field :confidence_plot_id, type: BSON::ObjectId - - def statistics - confidence_sum = 0 - predicted_values = [] - measured_values = [] - predictions.each do |pred| - pred[:database_activities].each do |activity| - if pred[:value] - predicted_values << pred[:value] - measured_values << activity - error = Math.log10(pred[:value])-Math.log10(activity) - self.rmse += error**2 - #self.weighted_rmse += pred[:confidence]*error**2 - self.mae += error.abs - #self.weighted_mae += pred[:confidence]*error.abs - #confidence_sum += pred[:confidence] - end - end - if pred[:database_activities].empty? - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - end - end - R.assign "measurement", measured_values - R.assign "prediction", predicted_values - R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" - r = R.eval("r").to_ruby - - self.mae = self.mae/predictions.size - #self.weighted_mae = self.weighted_mae/confidence_sum - self.rmse = Math.sqrt(self.rmse/predictions.size) - #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum) - self.r_squared = r**2 - self.finished_at = Time.now - save - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" + class ClassificationLeaveOneOut < LeaveOneOut + include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array, default: [] + field :weighted_confusion_matrix, type: Array, default: [] + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash, default: {} + field :predictivity, type: Hash, default: {} + field :confidence_plot_id, type: BSON::ObjectId end - - def correlation_plot - unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.svg" - predicted_values = [] - measured_values = [] - predictions.each do |pred| - pred[:database_activities].each do |activity| - if pred[:value] - predicted_values << pred[:value] - measured_values << activity - end - end - end - attributes = Model::Lazar.find(self.model_id).attributes - attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} - attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") - R.assign "measurement", measured_values - R.assign "prediction", predicted_values - R.eval "all = c(-log(measurement),-log(prediction))" - R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") - plot_id = $gridfs.insert_one(file) - update(:correlation_plot_id => plot_id) - end - $gridfs.find_one(_id: correlation_plot_id).data + + class RegressionLeaveOneOut < LeaveOneOut + include RegressionStatistics + field :rmse, type: Float, default: 0 + field :mae, type: Float, default: 0 + field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 + field :correlation_plot_id, type: BSON::ObjectId end + end end diff --git a/lib/model.rb b/lib/model.rb index 8e657b8..38c1915 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -2,7 +2,8 @@ module OpenTox module Model - class Model + class Lazar + include OpenTox include Mongoid::Document include Mongoid::Timestamps @@ -10,64 +11,247 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ - # datasets + field :algorithms, type: Hash, default:{} field :training_dataset_id, type: BSON::ObjectId - # algorithms - field :prediction_algorithm, type: String - # prediction feature + field :substance_ids, type: Array, default:[] field :prediction_feature_id, type: BSON::ObjectId + field :dependent_variables, type: Array, default:[] + field :descriptor_ids, type:Array, default:[] + field :independent_variables, type: Array, default:[] + field :fingerprints, type: Array, default:[] + field :descriptor_weights, type: Array, default:[] + field :descriptor_means, type: Array, default:[] + field :descriptor_sds, type: Array, default:[] + field :scaled_variables, type: Array, default:[] + field :version, type: Hash, default:{} + + def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} + bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset + prediction_feature = training_dataset.features.first unless prediction_feature + # TODO: prediction_feature without training_dataset: use all available data + + # guess model type + prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new + + model.prediction_feature_id = prediction_feature.id + model.training_dataset_id = training_dataset.id + model.name = "#{prediction_feature.name} (#{training_dataset.name})" + # TODO: check if this works for gem version, add gem versioning? + dir = File.dirname(__FILE__) + commit = `cd #{dir}; git rev-parse HEAD`.chomp + branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp + url = `cd #{dir}; git config --get remote.origin.url`.chomp + if branch + model.version = {:url => url, :branch => branch, :commit => commit} + else + model.version = {:warning => "git is not installed"} + end - def training_dataset - Dataset.find(training_dataset_id) + # set defaults + substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq + bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 + + if substance_classes.first == "OpenTox::Compound" + + model.algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :feature_selection => nil + } + + if model.class == LazarClassification + model.algorithms[:prediction] = { + :method => "Algorithm::Classification.weighted_majority_vote", + } + elsif model.class == LazarRegression + model.algorithms[:prediction] = { + :method => "Algorithm::Caret.pls", + } + end + + elsif substance_classes.first == "OpenTox::Nanoparticle" + model.algorithms = { + :descriptors => { + :method => "properties", + :categories => ["P-CHEM"], + }, + :similarity => { + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 + }, + :prediction => { + :method => "Algorithm::Caret.rf", + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + else + bad_request_error "Cannot create models for #{substance_classes.first}." + end + + # overwrite defaults with explicit parameters + algorithms.each do |type,parameters| + if parameters and parameters.is_a? Hash + parameters.each do |p,v| + model.algorithms[type] ||= {} + model.algorithms[type][p] = v + model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type + end + else + model.algorithms[type] = parameters + end + end if algorithms + + # parse dependent_variables from training dataset + training_dataset.substances.each do |substance| + values = training_dataset.values(substance,model.prediction_feature_id) + values.each do |v| + model.substance_ids << substance.id.to_s + model.dependent_variables << v + end if values + end + + descriptor_method = model.algorithms[:descriptors][:method] + case descriptor_method + # parse fingerprints + when "fingerprint" + type = model.algorithms[:descriptors][:type] + model.substances.each_with_index do |s,i| + model.fingerprints[i] ||= [] + model.fingerprints[i] += s.fingerprint(type) + model.fingerprints[i].uniq! + end + model.descriptor_ids = model.fingerprints.flatten.uniq + model.descriptor_ids.each do |d| + # resulting model may break BSON size limit (e.g. f Kazius dataset) + model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/ + end + # calculate physchem properties + when "calculate_properties" + features = model.algorithms[:descriptors][:features] + model.descriptor_ids = features.collect{|f| f.id.to_s} + model.algorithms[:descriptors].delete(:features) + model.algorithms[:descriptors].delete(:type) + model.substances.each_with_index do |s,i| + props = s.calculate_properties(features) + props.each_with_index do |v,j| + model.independent_variables[j] ||= [] + model.independent_variables[j][i] = v + end if props and !props.empty? + end + # parse independent_variables + when "properties" + categories = model.algorithms[:descriptors][:categories] + feature_ids = [] + categories.each do |category| + Feature.where(category:category).each{|f| feature_ids << f.id.to_s} + end + properties = model.substances.collect { |s| s.properties } + property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = feature_ids & property_ids + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} + else + bad_request_error "Descriptor method '#{descriptor_method}' not implemented." + end + + if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] + model = Algorithm.run model.algorithms[:feature_selection][:method], model + end + + # scale independent_variables + unless model.fingerprints? + model.independent_variables.each_with_index do |var,i| + model.descriptor_means[i] = var.mean + model.descriptor_sds[i] = var.standard_deviation + model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil} + end + end + model.save + model end - end - class Lazar < Model - - # algorithms - field :neighbor_algorithm, type: String - field :neighbor_algorithm_parameters, type: Hash, default: {} - - # Create a lazar model from a training_dataset and a feature_dataset - # @param [OpenTox::Dataset] training_dataset - # @return [OpenTox::Model::Lazar] Regression or classification model - def initialize training_dataset, params={} - - super params - - # TODO document convention - prediction_feature = training_dataset.features.first - # set defaults for empty parameters - self.prediction_feature_id ||= prediction_feature.id - self.training_dataset_id ||= training_dataset.id - self.name ||= "#{training_dataset.name} #{prediction_feature.name}" - self.neighbor_algorithm_parameters ||= {} - self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id - save - self - end - - def predict_compound compound - prediction_feature = Feature.find prediction_feature_id - neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) - # remove neighbors without prediction_feature - # check for database activities (neighbors may include query compound) - database_activities = nil + def predict_substance substance + + case algorithms[:similarity][:method] + when /tanimoto/ # binary features + similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] + # TODO this excludes descriptors only present in the query substance + # use for applicability domain? + query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} + when /euclid|cosine/ # quantitative features + if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors + features = descriptor_ids.collect{|id| Feature.find(id)} + query_descriptors = substance.calculate_properties(features) + similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]} + else + similarity_descriptors = [] + query_descriptors = [] + descriptor_ids.each_with_index do |id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + if prop + similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i] + query_descriptors[i] = prop + end + end + end + else + bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." + end + prediction = {} - if neighbors.collect{|n| n["_id"]}.include? compound.id + neighbor_ids = [] + neighbor_similarities = [] + neighbor_dependent_variables = [] + neighbor_independent_variables = [] - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq - prediction[:database_activities] = database_activities - prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." - neighbors.delete_if{|n| n["_id"] == compound.id} + prediction = {} + # find neighbors + substance_ids.each_with_index do |s,i| + # handle query substance + if substance.id.to_s == s + prediction[:measurements] ||= [] + prediction[:measurements] << dependent_variables[i] + prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." + else + if fingerprints? + neighbor_descriptors = fingerprints[i] + else + next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions + neighbor_descriptors = scaled_variables.collect{|v| v[i]} + end + sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] + if sim >= algorithms[:similarity][:min] + neighbor_ids << s + neighbor_similarities << sim + neighbor_dependent_variables << dependent_variables[i] + independent_variables.each_with_index do |c,j| + neighbor_independent_variables[j] ||= [] + neighbor_independent_variables[j] << independent_variables[j][i] + end + end + end end - neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] } - if neighbors.empty? - prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) + + measurements = nil + + if neighbor_similarities.empty? + prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + elsif neighbor_similarities.size == 1 + prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else - prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id})) - prediction[:neighbors] = neighbors - prediction[:neighbors] ||= [] + query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" + # call prediction algorithm + result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors + prediction.merge! result + prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end prediction end @@ -77,103 +261,81 @@ module OpenTox training_dataset = Dataset.find training_dataset_id # parse data - compounds = [] - case object.class.to_s - when "OpenTox::Compound" - compounds = [object] - when "Array" - compounds = object - when "OpenTox::Dataset" - compounds = object.compounds + substances = [] + if object.is_a? Substance + substances = [object] + elsif object.is_a? Array + substances = object + elsif object.is_a? Dataset + substances = object.substances else - bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." end # make predictions - predictions = [] - predictions = compounds.collect{|c| predict_compound c} + predictions = {} + substances.each do |c| + predictions[c.id.to_s] = predict_substance c + predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id + end # serialize result - case object.class.to_s - when "OpenTox::Compound" - prediction = predictions.first + if object.is_a? Substance + prediction = predictions[substances.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity return prediction - when "Array" + elsif object.is_a? Array return predictions - when "OpenTox::Dataset" + elsif object.is_a? Dataset # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id - prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) - prediction_dataset = LazarPrediction.new( + prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) + prediction_dataset = LazarPrediction.create( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, - :prediction_feature_id => prediction_feature.id - + :prediction_feature_id => prediction_feature.id, + :predictions => predictions ) - confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" ) - warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") - prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ] - prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]} - prediction_dataset.save return prediction_dataset end end - - def training_activities - i = training_dataset.feature_ids.index prediction_feature_id - training_dataset.data_entries.collect{|de| de[i]} + + def training_dataset + Dataset.find(training_dataset_id) + end + + def prediction_feature + Feature.find(prediction_feature_id) + end + + def descriptors + descriptor_ids.collect{|id| Feature.find(id)} + end + + def substances + substance_ids.collect{|id| Substance.find(id)} + end + + def fingerprints? + algorithms[:descriptors][:method] == "fingerprint" ? true : false end end class LazarClassification < Lazar - - def self.create training_dataset, params={} - model = self.new training_dataset, params - model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm - model.neighbor_algorithm ||= "fingerprint_neighbors" - model.neighbor_algorithm_parameters ||= {} - { - :type => "MP2D", - :training_dataset_id => training_dataset.id, - :min_sim => 0.1 - }.each do |key,value| - model.neighbor_algorithm_parameters[key] ||= value - end - model.save - model - end end class LazarRegression < Lazar - - def self.create training_dataset, params={} - model = self.new training_dataset, params - model.neighbor_algorithm ||= "fingerprint_neighbors" - model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" - model.neighbor_algorithm_parameters ||= {} - { - :type => "MP2D", - :training_dataset_id => training_dataset.id, - :min_sim => 0.1 - }.each do |key,value| - model.neighbor_algorithm_parameters[key] ||= value - end - model.save - model - end end - class Prediction + class Validation + include OpenTox include Mongoid::Document include Mongoid::Timestamps - # TODO field Validations field :endpoint, type: String field :species, type: String field :source, type: String @@ -182,7 +344,7 @@ module OpenTox field :repeated_crossvalidation_id, type: BSON::ObjectId def predict object - Lazar.find(model_id).predict object + model.predict object end def training_dataset @@ -193,8 +355,17 @@ module OpenTox Lazar.find model_id end + def algorithms + model.algorithms + end + + def prediction_feature + model.prediction_feature + end + def repeated_crossvalidation - RepeatedCrossValidation.find repeated_crossvalidation_id + # full class name required + OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id end def crossvalidations @@ -202,29 +373,51 @@ module OpenTox end def regression? - training_dataset.features.first.numeric? + model.is_a? LazarRegression end def classification? - training_dataset.features.first.nominal? + model.is_a? LazarClassification end def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file - prediction_model = self.new JSON.parse(File.read(metadata_file)) + model_validation = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file - model = nil - if training_dataset.features.first.nominal? - model = LazarClassification.create training_dataset - elsif training_dataset.features.first.numeric? - model = LazarRegression.create training_dataset + model = Lazar.create training_dataset: training_dataset + model_validation[:model_id] = model.id + # full class name required + model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id + model_validation.save + model_validation + end + + def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil + + # find/import training_dataset + training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset # try to import from json dump + Import::Enanomapper.import + training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset end - prediction_model[:model_id] = model.id - prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id - prediction_model.save - prediction_model + prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first + + model_validation = self.new( + :endpoint => prediction_feature.name, + :source => prediction_feature.source, + :species => "A549 human lung epithelial carcinoma cells", + :unit => prediction_feature.unit + ) + model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms + model_validation[:model_id] = model.id + repeated_cv = Validation::RepeatedCrossValidation.create model + model_validation[:repeated_crossvalidation_id] = repeated_cv.id + model_validation.save + model_validation end + end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb new file mode 100644 index 0000000..02d9a89 --- /dev/null +++ b/lib/nanoparticle.rb @@ -0,0 +1,92 @@ +module OpenTox + + class Nanoparticle < Substance + include OpenTox + + field :core_id, type: String, default: nil + field :coating_ids, type: Array, default: [] + + def core + Compound.find core_id + end + + def coating + coating_ids.collect{|i| Compound.find i } + end + + def fingerprint type=DEFAULT_FINGERPRINT + core_fp = core.fingerprint type + coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact + (core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact + end + + def calculate_properties descriptors=PhysChem::OPENBABEL + if core.smiles and !coating.collect{|c| c.smiles}.compact.empty? + core_prop = core.calculate_properties descriptors + coating_prop = coating.collect{|c| c.calculate_properties descriptors if c.smiles} + descriptors.collect_with_index{|d,i| [core_prop[i],coating_prop.collect{|c| c[i] if c}]} + end + end + + def add_feature feature, value, dataset + unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand + case feature.category + when "P-CHEM" + properties[feature.id.to_s] ||= [] + properties[feature.id.to_s] << value + properties[feature.id.to_s].uniq! + when "Proteomics" + properties[feature.id.to_s] ||= [] + properties[feature.id.to_s] << value + properties[feature.id.to_s].uniq! + when "TOX" + dataset.add self, feature, value + else + warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." + end + dataset_ids << dataset.id + dataset_ids.uniq! + end + end + + def parse_ambit_value feature, v, dataset + # TODO add study id to warnings + v.delete "unit" + # TODO: ppm instead of weights + if v.keys == ["textValue"] + add_feature feature, v["textValue"], dataset + elsif v.keys == ["loValue"] + add_feature feature, v["loValue"], dataset + elsif v.keys.size == 2 and v["errorValue"] + add_feature feature, v["loValue"], dataset + #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + elsif v.keys.size == 2 and v["loQualifier"] == "mean" + add_feature feature, v["loValue"], dataset + #warn "'#{feature.name}' is a mean value. Original data is not available." + elsif v.keys.size == 2 and v["loQualifier"] #== ">=" + #warn "Only min value available for '#{feature.name}', entry ignored" + elsif v.keys.size == 2 and v["upQualifier"] #== ">=" + #warn "Only max value available for '#{feature.name}', entry ignored" + elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? + add_feature feature, v["loValue"], dataset + #warn "loQualifier and upQualifier are empty." + elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" + add_feature feature, v["loValue"], dataset + #warn "loQualifier and upQualifier are empty." + elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? + add_feature feature, v["loValue"], dataset + #warn "loQualifier and upQualifier are empty." + elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] + #add_feature feature, [v["loValue"],v["upValue"]].mean, dataset + #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." + elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"] + #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + add_feature feature, v["loValue"], dataset + elsif v == {} # do nothing + else + warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." + end + end + + end +end diff --git a/lib/opentox.rb b/lib/opentox.rb index 186c87a..5c300cf 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -1,8 +1,6 @@ module OpenTox - # Ruby interface - - # create default OpenTox classes (defined in opentox-client.rb) + # create default OpenTox classes # provides Mongoid's query and persistence methods # http://mongoid.org/en/mongoid/docs/persistence.html # http://mongoid.org/en/mongoid/docs/querying.html @@ -13,10 +11,15 @@ module OpenTox include Mongoid::Timestamps store_in collection: klass.downcase.pluralize field :name, type: String + field :source, type: String field :warnings, type: Array, default: [] + + def warn warning + $logger.warn warning + warnings << warning + end end OpenTox.const_set klass,c end end - diff --git a/lib/overwrite.rb b/lib/overwrite.rb index cef5758..31d30c9 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -28,6 +28,11 @@ class Float def signif(n) Float("%.#{n}g" % self) end + + # converts -10 logarithmized values back + def delog10 + 10**(-1*self) + end end module Enumerable @@ -101,19 +106,35 @@ class Array end def mean - self.inject{ |sum, el| sum + el }.to_f / self.size + self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size end def sample_variance m = self.mean - sum = self.inject(0){|accum, i| accum +(i-m)**2 } - sum/(self.length - 1).to_f + sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 } + sum/(self.compact.length - 1).to_f end def standard_deviation Math.sqrt(self.sample_variance) end + def for_R + if self.first.is_a?(String) + #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets + "NA" + else + self.median + end + end + + def collect_with_index + result = [] + self.each_with_index do |elt, idx| + result << yield(elt, idx) + end + result + end end module URI diff --git a/lib/physchem.rb b/lib/physchem.rb index f7b880f..327acd8 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -14,7 +14,7 @@ module OpenTox JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] - OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + OPENBABEL = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| name,description = d.split(/\s+/,2) ["Openbabel."+name,description] unless obexclude.include? name end.compact.sort{|a,b| a[0] <=> b[0]}] @@ -25,24 +25,24 @@ module OpenTox prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'') d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] } end - CDKDESCRIPTORS = cdkdescriptors + CDK = cdkdescriptors # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] # strip Joelib messages from stdout - JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + JOELIB = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| name = d[:java_class].sub(/^joelib2.feature.types./,'') ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name end.compact.sort{|a,b| a[0] <=> b[0]}] - DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + DESCRIPTORS = OPENBABEL.merge(CDK.merge(JOELIB)) require_relative "unique_descriptors.rb" def self.descriptors desc=DESCRIPTORS desc.collect do |name,description| lib,desc = name.split('.',2) - self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end end @@ -54,26 +54,26 @@ module OpenTox CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n| dname = "#{name}.#{n}" description = DESCRIPTORS[dname] - udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end else description = DESCRIPTORS[name] - udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end end udesc end def self.openbabel_descriptors - descriptors OBDESCRIPTORS + descriptors OPENBABEL end def self.cdk_descriptors - descriptors CDKDESCRIPTORS + descriptors CDK end def self.joelib_descriptors - descriptors JOELIBDESCRIPTORS + descriptors JOELIB end def calculate compound @@ -131,3 +131,4 @@ module OpenTox end end +OpenTox::PhysChem.descriptors # load descriptor features diff --git a/lib/regression.rb b/lib/regression.rb index 5021fb3..3890987 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,148 +3,18 @@ module OpenTox class Regression - def self.local_weighted_average compound, params + def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil + # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 - neighbors = params[:neighbors] - neighbors.each do |row| - sim = row["tanimoto"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum += sim*Math.log10(act) - sim_sum += sim - end - end - end - sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) + dependent_variables.each_with_index do |v,i| + weighted_sum += weights[i]*dependent_variables[i] + sim_sum += weights[i] + end if dependent_variables + sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end - # TODO explicit neighbors, also for physchem - def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" - neighbors = params[:neighbors] - return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - activities = [] - fingerprints = {} - weights = [] - fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort - - neighbors.each_with_index do |row,i| - neighbor = Compound.find row["_id"] - fingerprint = neighbor.fingerprint - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) - weights << row["tanimoto"] - fingerprint_ids.each_with_index do |id,j| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) - end - end - end - end - - variables = [] - data_frame = [activities] - fingerprints.each do |k,v| - unless v.uniq.size == 1 - data_frame << v.collect{|m| m ? "T" : "F"} - variables << k - end - end - - if variables.empty? - result = local_weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result - - else - compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} - prediction = r_model_prediction method, data_frame, variables, weights, compound_features - if prediction.nil? or prediction[:value].nil? - prediction = local_weighted_average(compound, params) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return prediction - else - prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])] - prediction[:value] = 10**prediction[:value] - prediction[:rmse] = 10**prediction[:rmse] - prediction - end - end - - end - - def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4" - - neighbors = params[:neighbors] - return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 - - activities = [] - weights = [] - physchem = {} - - neighbors.each_with_index do |row,i| - neighbor = Compound.find row["_id"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) - weights << row["tanimoto"] # TODO cosine ? - neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity - physchem[pid] ||= [] - physchem[pid] << v - end - end - end - end - - # remove properties with a single value - physchem.each do |pid,v| - physchem.delete(pid) if v.uniq.size <= 1 - end - - if physchem.empty? - result = local_weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result - - else - data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } - prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} - if prediction.nil? - prediction = local_weighted_average(compound, params) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return prediction - else - prediction[:value] = 10**prediction[:value] - prediction - end - end - - end - - def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values - R.assign "weights", training_weights - r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" - R.eval "data <- #{r_data_frame}" - R.assign "features", training_features - R.eval "names(data) <- append(c('activities'),features)" # - begin - R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" - rescue - return nil - end - R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - R.eval "names(fingerprint) <- features" - R.eval "prediction <- predict(model,fingerprint)" - { - :value => R.eval("prediction").to_f, - :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, - :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, - } - end - end end end diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb index 9321a75..2073be2 100644 --- a/lib/rest-client-wrapper.rb +++ b/lib/rest-client-wrapper.rb @@ -55,14 +55,8 @@ module OpenTox if [301, 302, 307].include? response.code and request.method == :get response.follow_redirection(request, result) elsif response.code >= 400 and !URI.task?(uri) - #TODO add parameters to error-report - #parameters = request.args - #parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid] - #parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url] - #message += "\nREST parameters:\n#{parameters.inspect}" error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first begin # errors are returned as error reports in json, try to parse - # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task content = JSON.parse(response) msg = content["message"].to_s cause = content["errorCause"].to_s diff --git a/lib/similarity.rb b/lib/similarity.rb new file mode 100644 index 0000000..0901936 --- /dev/null +++ b/lib/similarity.rb @@ -0,0 +1,65 @@ +module OpenTox + module Algorithm + + class Vector + def self.dot_product(a, b) + products = a.zip(b).map{|a, b| a * b} + products.inject(0) {|s,p| s + p} + end + + def self.magnitude(point) + squares = point.map{|x| x ** 2} + Math.sqrt(squares.inject(0) {|s, c| s + c}) + end + end + + class Similarity + + def self.tanimoto fingerprints + ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f + end + + #def self.weighted_tanimoto fingerprints + #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f + #end + + def self.euclid scaled_properties + sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2} + Math.sqrt(sq.inject(0) {|s,c| s + c}) + end + + # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity + def self.cosine scaled_properties + scaled_properties = remove_nils scaled_properties + Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1])) + end + + def self.weighted_cosine scaled_properties # [a,b,weights] + a,b,w = remove_nils scaled_properties + return cosine(scaled_properties) if w.uniq.size == 1 + dot_product = 0 + magnitude_a = 0 + magnitude_b = 0 + (0..a.size-1).each do |i| + dot_product += w[i].abs*a[i]*b[i] + magnitude_a += w[i].abs*a[i]**2 + magnitude_b += w[i].abs*b[i]**2 + end + dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b)) + end + + def self.remove_nils scaled_properties + a =[]; b = []; w = [] + (0..scaled_properties.first.size-1).each do |i| + if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan? + a << scaled_properties[0][i] + b << scaled_properties[1][i] + w << scaled_properties[2][i] + end + end + [a,b,w] + end + + end + end +end diff --git a/lib/substance.rb b/lib/substance.rb new file mode 100644 index 0000000..31c465e --- /dev/null +++ b/lib/substance.rb @@ -0,0 +1,8 @@ +module OpenTox + + class Substance + field :properties, type: Hash, default: {} + field :dataset_ids, type: Array, default: [] + end + +end diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb new file mode 100644 index 0000000..71abad2 --- /dev/null +++ b/lib/train-test-validation.rb @@ -0,0 +1,69 @@ +module OpenTox + + module Validation + + class TrainTest < Validation + + field :training_dataset_id, type: BSON::ObjectId + field :test_dataset_id, type: BSON::ObjectId + + def self.create model, training_set, test_set + + validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms + validation_model.save + predictions = validation_model.predict test_set.substances + nr_unpredicted = 0 + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) + else + nr_unpredicted += 1 + end + end + predictions.select!{|cid,p| p[:value] and p[:measurements]} + validation = self.new( + :model_id => validation_model.id, + :test_dataset_id => test_set.id, + :nr_instances => test_set.substances.size, + :nr_unpredicted => nr_unpredicted, + :predictions => predictions + ) + validation.save + validation + end + + def test_dataset + Dataset.find test_dataset_id + end + + def training_dataset + Dataset.find training_dataset_id + end + + end + + class ClassificationTrainTest < TrainTest + include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array + field :weighted_confusion_matrix, type: Array + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash + field :predictivity, type: Hash + field :probability_plot_id, type: BSON::ObjectId + end + + class RegressionTrainTest < TrainTest + include RegressionStatistics + field :rmse, type: Float, default:0 + field :mae, type: Float, default:0 + field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 + field :correlation_plot_id, type: BSON::ObjectId + end + + end + +end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb new file mode 100644 index 0000000..b6f8a60 --- /dev/null +++ b/lib/validation-statistics.rb @@ -0,0 +1,225 @@ +module OpenTox + module Validation + module ClassificationStatistics + + def statistics + self.accept_values = model.prediction_feature.accept_values + self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + true_rate = {} + predictivity = {} + nr_instances = 0 + predictions.each do |cid,pred| + # TODO + # use predictions without probabilities (single neighbor)?? + # use measured majority class?? + if pred[:measurements].uniq.size == 1 and pred[:probabilities] + m = pred[:measurements].first + if pred[:value] == m + if pred[:value] == accept_values[0] + confusion_matrix[0][0] += 1 + weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][1] += 1 + weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end + elsif pred[:value] != m + if pred[:value] == accept_values[0] + confusion_matrix[0][1] += 1 + weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][0] += 1 + weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end + end + end + end + true_rate = {} + predictivity = {} + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f + self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + $logger.debug "Accuracy #{accuracy}" + save + { + :accept_values => accept_values, + :confusion_matrix => confusion_matrix, + :weighted_confusion_matrix => weighted_confusion_matrix, + :accuracy => accuracy, + :weighted_accuracy => weighted_accuracy, + :true_rate => true_rate, + :predictivity => predictivity, + } + end + + def probability_plot format: "pdf" + #unless probability_plot_id + + #tmpdir = File.join(ENV["HOME"], "tmp") + tmpdir = "/tmp" + #p tmpdir + FileUtils.mkdir_p tmpdir + tmpfile = File.join(tmpdir,"#{id.to_s}_probability.#{format}") + accuracies = [] + probabilities = [] + correct_predictions = 0 + incorrect_predictions = 0 + pp = [] + predictions.values.select{|p| p["probabilities"]}.compact.each do |p| + p["measurements"].each do |m| + pp << [ p["probabilities"][p["value"]], p["value"] == m ] + end + end + pp.sort_by!{|p| 1-p.first} + pp.each do |p| + p[1] ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + probabilities << p[0] + end + R.assign "accuracy", accuracies + R.assign "probability", probabilities + R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:probability_plot_id => plot_id) + #end + $gridfs.find_one(_id: probability_plot_id).data + end + end + + module RegressionStatistics + + def statistics + self.rmse = 0 + self.mae = 0 + self.within_prediction_interval = 0 + self.out_of_prediction_interval = 0 + x = [] + y = [] + predictions.each do |cid,pred| + if pred[:value] and pred[:measurements] + x << pred[:measurements].median + y << pred[:value] + error = pred[:value]-pred[:measurements].median + self.rmse += error**2 + self.mae += error.abs + if pred[:prediction_interval] + if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1] + self.within_prediction_interval += 1 + else + self.out_of_prediction_interval += 1 + end + end + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(measurement,prediction,use='pairwise')" + self.r_squared = R.eval("r").to_ruby**2 + self.mae = self.mae/predictions.size + self.rmse = Math.sqrt(self.rmse/predictions.size) + $logger.debug "R^2 #{r_squared}" + $logger.debug "RMSE #{rmse}" + $logger.debug "MAE #{mae}" + $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval" + save + { + :mae => mae, + :rmse => rmse, + :r_squared => r_squared, + :within_prediction_interval => within_prediction_interval, + :out_of_prediction_interval => out_of_prediction_interval, + } + end + + def percent_within_prediction_interval + 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval) + end + + def correlation_plot format: "png" + unless correlation_plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + x = [] + y = [] + feature = Feature.find(predictions.first.last["prediction_feature_id"]) + predictions.each do |sid,p| + x << p["measurements"].median + y << p["value"] + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(measurement,prediction)" + R.eval "range = c(min(all), max(all))" + title = feature.name + title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank? + R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") + plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => plot_id) + end + $gridfs.find_one(_id: correlation_plot_id).data + end + + def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false + worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n] + worst_predictions.collect do |p| + substance = Substance.find(p.first) + prediction = p[1] + if show_neigbors + neighbors = prediction["neighbors"].collect do |n| + common_descriptors = [] + if show_common_descriptors + common_descriptors = n["common_descriptors"].collect do |d| + f=Feature.find(d) + { + :id => f.id.to_s, + :name => "#{f.name} (#{f.conditions})", + :p_value => d[:p_value], + :r_squared => d[:r_squared], + } + end + else + common_descriptors = n["common_descriptors"].size + end + { + :name => Substance.find(n["_id"]).name, + :id => n["_id"].to_s, + :common_descriptors => common_descriptors + } + end + else + neighbors = prediction["neighbors"].size + end + { + :id => substance.id.to_s, + :name => substance.name, + :feature => Feature.find(prediction["prediction_feature_id"]).name, + :error => (prediction["value"] - prediction["measurements"].median).abs, + :prediction => prediction["value"], + :measurements => prediction["measurements"], + :neighbors => neighbors + } + end + end + end + end +end diff --git a/lib/validation.rb b/lib/validation.rb index b72d273..ced9596 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -1,108 +1,25 @@ module OpenTox - class Validation - - field :model_id, type: BSON::ObjectId - field :prediction_dataset_id, type: BSON::ObjectId - field :crossvalidation_id, type: BSON::ObjectId - field :test_dataset_id, type: BSON::ObjectId - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Array - - def prediction_dataset - Dataset.find prediction_dataset_id - end - - def test_dataset - Dataset.find test_dataset_id - end - - def model - Model::Lazar.find model_id - end - - def self.create model, training_set, test_set, crossvalidation=nil - - atts = model.attributes.dup # do not modify attributes from original model - atts["_id"] = BSON::ObjectId.new - atts[:training_dataset_id] = training_set.id - validation_model = model.class.create training_set, atts - validation_model.save - cids = test_set.compound_ids - - test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used - prediction_dataset = validation_model.predict test_set_without_activities - predictions = [] - nr_unpredicted = 0 - activities = test_set.data_entries.collect{|de| de.first} - prediction_dataset.data_entries.each_with_index do |de,i| - if de[0] #and de[1] - cid = prediction_dataset.compound_ids[i] - rows = cids.each_index.select{|r| cids[r] == cid } - activities = rows.collect{|r| test_set.data_entries[r][0]} - prediction = de.first - confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]] - else - nr_unpredicted += 1 - end + module Validation + + class Validation + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "validations" + field :name, type: String + field :model_id, type: BSON::ObjectId + field :nr_instances, type: Integer, default: 0 + field :nr_unpredicted, type: Integer, default: 0 + field :predictions, type: Hash, default: {} + field :finished_at, type: Time + + def model + Model::Lazar.find model_id end - validation = self.new( - :model_id => validation_model.id, - :prediction_dataset_id => prediction_dataset.id, - :test_dataset_id => test_set.id, - :nr_instances => test_set.compound_ids.size, - :nr_unpredicted => nr_unpredicted, - :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence - ) - validation.crossvalidation_id = crossvalidation.id if crossvalidation - validation.save - validation - end - - end - - class ClassificationValidation < Validation - end - class RegressionValidation < Validation - - def statistics - rmse = 0 - weighted_rmse = 0 - rse = 0 - weighted_rse = 0 - mae = 0 - weighted_mae = 0 - confidence_sum = 0 - predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction - error = Math.log10(prediction)-Math.log10(activity.median) - rmse += error**2 - weighted_rmse += confidence*error**2 - mae += error.abs - weighted_mae += confidence*error.abs - confidence_sum += confidence - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - end - end - x = predictions.collect{|p| p[1].median} - y = predictions.collect{|p| p[2]} - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" - r = R.eval("r").to_ruby - - mae = mae/predictions.size - weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/predictions.size) - weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } end + end end |