From d5bf97c2cb999539c56bf59aa1d7d3286745be84 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 23 Sep 2015 14:51:41 +0200 Subject: validations fixed (all models were executed with default parameters) --- lib/compound.rb | 41 +++++++++++- lib/crossvalidation.rb | 7 +- lib/dataset.rb | 40 ++++++++++++ lib/descriptor.rb | 3 +- lib/experiment.rb | 162 +++++++++++++++++++++------------------------- lib/model.rb | 88 ++++++++++++++++--------- lib/unique_descriptors.rb | 4 +- lib/validation.rb | 12 +++- 8 files changed, 227 insertions(+), 130 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 7abd913..d3df125 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -44,6 +44,21 @@ module OpenTox compound.save compound end + + #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format + def mpd + smarts = obconversion(smiles,"smi","mpd").strip.split("\t") + smarts.shift # remove Title + smarts + + end + + #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html + def mna level=2 + smarts = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") + smarts.shift # remove Title + smarts + end def openbabel_fingerprint type="FP2" unless self.send(type.downcase.to_sym) # stored fingerprint @@ -72,7 +87,7 @@ module OpenTox end start += bitsperint end - update type.downcase.to_sym, bits_set + update_attribute type.downcase.to_sym, bits_set end self.send(type.downcase.to_sym) end @@ -242,6 +257,28 @@ module OpenTox neighbors end + def physchem_neighbors params + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] + neighbors = [] + feature_dataset.data_entries.each_with_index do |fingerprint, i| + # TODO implement pearson and cosine similarity separatly + R.assign "x", query_fingerprint + R.assign "y", fingerprint + # pearson r + #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby + #p "pearson" + #p sim + #p "cosine" + sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first + #p sim + if sim >= params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + def neighbors threshold=0.7 # TODO restrict to dataset # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb @@ -308,7 +345,7 @@ print sdf end def obconversion(identifier,input_format,output_format,option=nil) - self.class.obconversion(identifier,input_format,output_format,option=nil) + self.class.obconversion(identifier,input_format,output_format,option) end end end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 337b434..4c80344 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -33,15 +33,12 @@ module OpenTox nr_instances = 0 nr_unpredicted = 0 predictions = [] - validation_class = Object.const_get(self.to_s.sub(/Cross/,'')) training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| fork do # parallel execution of validations $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" t = Time.now - #p validation_class#.create(model, fold[0], fold[1],cv) - validation = validation_class.create(model, fold[0], fold[1],cv) - #p validation + validation = Validation.create(model, fold[0], fold[1],cv) $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" end end @@ -170,7 +167,7 @@ module OpenTox y = predictions.collect{|p| p[2]} R.assign "measurement", x R.assign "prediction", y - R.eval "r <- cor(-log(measurement),-log(prediction))" + R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" r = R.eval("r").to_ruby mae = mae/predictions.size diff --git a/lib/dataset.rb b/lib/dataset.rb index 946fd90..7c8ab44 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -46,6 +46,12 @@ module OpenTox else @data_entries = Marshal.load(data_entry_file.data) bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array + unless @data_entries.first.size == feature_ids.size + # TODO: fix (unknown) source of empty data_entries + sleep 1 + data_entry_file = $gridfs.find_one(_id: data_entries_id) + @data_entries = Marshal.load(data_entry_file.data) + end bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size # TODO: data_entries can be empty, poorly reproducible, mongo problem? bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size @@ -281,6 +287,29 @@ module OpenTox end end end + + def scale + scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)} + centers = [] + scales = [] + feature_ids.each_with_index do |feature_id,col| + R.assign "x", data_entries.collect{|de| de[col]} + R.eval "scaled = scale(x,center=T,scale=T)" + centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby + scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby + R.eval("scaled").to_ruby.each_with_index do |value,row| + scaled_data_entries[row][col] = value + end + end + scaled_dataset = ScaledDataset.new(attributes) + scaled_dataset["_id"] = BSON::ObjectId.new + scaled_dataset["_type"] = "OpenTox::ScaledDataset" + scaled_dataset.centers = centers + scaled_dataset.scales = scales + scaled_dataset.data_entries = scaled_data_entries + scaled_dataset.save_all + scaled_dataset + end end # Dataset for lazar predictions @@ -297,6 +326,17 @@ module OpenTox # Dataset for descriptors (physchem) class DescriptorDataset < Dataset field :feature_calculation_algorithm, type: String + + end + + class ScaledDataset < DescriptorDataset + + field :centers, type: Array, default: [] + field :scales, type: Array, default: [] + + def original_value value, i + value * scales[i] + centers[i] + end end # Dataset for fminer descriptors diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 5ae0ef2..9733bde 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -16,7 +16,7 @@ module OpenTox LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") - obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"] + obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| name,description = d.split(/\s+/,2) ["Openbabel."+name,description] unless obexclude.include? name @@ -107,6 +107,7 @@ module OpenTox des[lib] << descriptor end des.each do |lib,descriptors| + p lib, descriptors send(lib, descriptors) end serialize diff --git a/lib/experiment.rb b/lib/experiment.rb index 0a76c53..616a273 100644 --- a/lib/experiment.rb +++ b/lib/experiment.rb @@ -4,105 +4,93 @@ module OpenTox field :dataset_ids, type: Array field :model_settings, type: Array, default: [] field :results, type: Hash, default: {} - end - def run - dataset_ids.each do |dataset_id| - dataset = Dataset.find(dataset_id) - results[dataset_id.to_s] = [] - model_settings.each do |setting| - model = Object.const_get(setting[:algorithm]).create dataset - model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm] - model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm] - model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter] - model.save - repeated_crossvalidation = RepeatedCrossValidation.create model - results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id} + def run + dataset_ids.each do |dataset_id| + dataset = Dataset.find(dataset_id) + results[dataset_id.to_s] = [] + model_settings.each do |setting| + model_algorithm = setting.delete :model_algorithm + model = Object.const_get(model_algorithm).create dataset, setting + #model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm] + #model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm] + #model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter] + p model + model.save + repeated_crossvalidation = RepeatedCrossValidation.create model + results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id} + end end + save end - save - end - - def self.create params - experiment = self.new - $logge.debug "Experiment started ..." - #experiment.run params - experiment - end - def report - # TODO significances - # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/ - report = {} - report[:name] = name - report[:experiment_id] = self.id.to_s - report[:results] = {} - parameters = [] - dataset_ids.each do |dataset_id| - dataset_name = Dataset.find(dataset_id).name - report[:results][dataset_name] = {} - report[:results][dataset_name][:anova] = {} - report[:results][dataset_name][:data] = [] - results[dataset_id.to_s].each do |result| - model = Model::Lazar.find(result[:model_id]) - repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id]) - crossvalidations = repeated_cv.crossvalidations - if crossvalidations.first.is_a? ClassificationCrossValidation - parameters = [:accuracy,:true_rate,:predictivity] - elsif crossvalidations.first.is_a? RegressionCrossValidation - parameters = [:rmse,:mae,:r_squared] - end - summary = {} - [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key| - summary[key] = model[key] - end - summary[:nr_instances] = crossvalidations.first.nr_instances - summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted} - summary[:time] = crossvalidations.collect{|cv| cv.time} - parameters.each do |param| - summary[param] = crossvalidations.collect{|cv| cv.send(param)} + def report + # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/ + report = {} + report[:name] = name + report[:experiment_id] = self.id.to_s + report[:results] = {} + parameters = [] + dataset_ids.each do |dataset_id| + dataset_name = Dataset.find(dataset_id).name + report[:results][dataset_name] = {} + report[:results][dataset_name][:anova] = {} + report[:results][dataset_name][:data] = [] + results[dataset_id.to_s].each do |result| + model = Model::Lazar.find(result[:model_id]) + repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id]) + crossvalidations = repeated_cv.crossvalidations + if crossvalidations.first.is_a? ClassificationCrossValidation + parameters = [:accuracy,:true_rate,:predictivity] + elsif crossvalidations.first.is_a? RegressionCrossValidation + parameters = [:rmse,:mae,:r_squared] + end + summary = {} + [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key| + summary[key] = model[key] + end + summary[:nr_instances] = crossvalidations.first.nr_instances + summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted} + summary[:time] = crossvalidations.collect{|cv| cv.time} + parameters.each do |param| + summary[param] = crossvalidations.collect{|cv| cv.send(param)} + end + report[:results][dataset_name][:data] << summary end - report[:results][dataset_name][:data] << summary end - end - report[:results].each do |dataset,results| - ([:time,:nr_unpredicted]+parameters).each do |param| - experiments = [] - outcome = [] - results[:data].each_with_index do |result,i| - result[param].each do |p| - experiments << i - outcome << p + report[:results].each do |dataset,results| + ([:time,:nr_unpredicted]+parameters).each do |param| + experiments = [] + outcome = [] + results[:data].each_with_index do |result,i| + result[param].each do |p| + experiments << i + p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0 + outcome << p + end end - end - R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"} - R.eval "experiment_nr = factor(experiment_nr)" - R.assign "outcome",outcome - R.eval "data = data.frame(experiment_nr,outcome)" - # one-way ANOVA - R.eval "fit = aov(outcome ~ experiment_nr, data=data)" - # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov - p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby - # aequivalent - # sum = R.eval("summary(fit)") - #p_value = sum.to_ruby.first.last.first + R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"} + R.eval "experiment_nr = factor(experiment_nr)" + R.assign "outcome", outcome + R.eval "data = data.frame(experiment_nr,outcome)" + # one-way ANOVA + R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')" + # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov + p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby + # aequivalent + # sum = R.eval("summary(fit)") + #p_value = sum.to_ruby.first.last.first + report[:results][dataset][:anova][param] = p_value =begin - if p_value < 0.01 - p_value = "#{p_value} ***" - elsif p_value < 0.05 - p_value = "#{p_value} **" - elsif p_value < 0.1 - p_value = "#{p_value} *" - end =end - report[:results][dataset][:anova][param] = p_value + end end + report end - report - end - def summary - report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}} + def summary + report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}} + end end end diff --git a/lib/model.rb b/lib/model.rb index 9892f64..817a61e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -26,25 +26,26 @@ module OpenTox # algorithms field :neighbor_algorithm, type: String - field :neighbor_algorithm_parameters, type: Hash + field :neighbor_algorithm_parameters, type: Hash, default: {} # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset # @return [OpenTox::Model::Lazar] Regression or classification model - def self.create training_dataset + def initialize training_dataset, params={} + super params bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 # TODO document convention prediction_feature = training_dataset.features.first - prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new - lazar.training_dataset_id = training_dataset.id - lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id - lazar.prediction_feature_id = prediction_feature.id - lazar.name = "#{training_dataset.name} #{prediction_feature.name}" - - lazar.save - lazar + # set defaults for empty parameters + self.prediction_feature_id ||= prediction_feature.id + self.training_dataset_id ||= training_dataset.id + self.name ||= "#{training_dataset.name} #{prediction_feature.name}" + self.neighbor_algorithm_parameters ||= {} + self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + save + self end def predict object @@ -80,6 +81,7 @@ module OpenTox next end neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) + #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) # add activities # TODO: improve efficiency, takes 3 times longer than previous version @@ -90,6 +92,17 @@ module OpenTox end neighbors.compact! # remove neighbors without training activities predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size}) +=begin +# TODO scaled dataset for physchem + p neighbor_algorithm_parameters + p (neighbor_algorithm_parameters["feature_dataset_id"]) + d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]) + p d + p d.class + if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset + p "SCALED" + end +=end end # serialize result @@ -128,15 +141,40 @@ module OpenTox end class LazarClassification < Lazar - def initialize - super - self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" - self.neighbor_algorithm = "fingerprint_neighbors" - self.neighbor_algorithm_parameters = { + + def self.create training_dataset, params={} + model = self.new training_dataset, params + model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm + model.neighbor_algorithm |= "fingerprint_neighbors" + model.neighbor_algorithm_parameters ||= {} + { :type => "FP4", - :training_dataset_id => training_dataset_id, + :training_dataset_id => training_dataset.id, :min_sim => 0.7 - } + }.each do |key,value| + model.neighbor_algorithm_parameters[key] ||= value + end + model.save + model + end + end + + class LazarRegression < Lazar + + def self.create training_dataset, params={} + model = self.new training_dataset, params + #model.neighbor_algorithm ||= "fingerprint_neighbors" + #model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average" + #model.neighbor_algorithm_parameters ||= {} + #{ + #:type => "FP4", + #:training_dataset_id => training_dataset.id, + #:min_sim => 0.7 + #}.each do |key,value| + #model.neighbor_algorithm_parameters[key] ||= value + #end + model.save + model end end @@ -159,26 +197,12 @@ module OpenTox end end - class LazarRegression < Lazar - - def initialize - super - #self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" - self.neighbor_algorithm = "fingerprint_neighbors" - self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" - self.neighbor_algorithm_parameters = { - :type => "FP4", - :training_dataset_id => self.training_dataset_id, - :min_sim => 0.7 - } - end - end - class Prediction include OpenTox include Mongoid::Document include Mongoid::Timestamps + # TODO cv -> repeated cv # TODO field Validations field :endpoint, type: String field :species, type: String diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb index 676f34a..cf9cbf3 100644 --- a/lib/unique_descriptors.rb +++ b/lib/unique_descriptors.rb @@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [ "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib) "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib) "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib) - "Openbabel.L5", #Lipinski Rule of Five + #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! "Openbabel.logP", #octanol/water partition coefficient "Openbabel.MP", #Melting point "Openbabel.MR", #molar refractivity @@ -56,7 +56,7 @@ UNIQUEDESCRIPTORS = [ "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth. "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O - "Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms . + #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms . "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration. "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule. "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule. diff --git a/lib/validation.rb b/lib/validation.rb index 63fbd89..9eebef8 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -2,6 +2,7 @@ module OpenTox class Validation + field :model_id, type: BSON::ObjectId field :prediction_dataset_id, type: BSON::ObjectId field :crossvalidation_id, type: BSON::ObjectId field :test_dataset_id, type: BSON::ObjectId @@ -17,9 +18,17 @@ module OpenTox Dataset.find test_dataset_id end + def model + Model::Lazar.find model_id + end + def self.create model, training_set, test_set, crossvalidation=nil - validation_model = model.class.create training_set#, features + atts = model.attributes.dup # do not modify attributes from original model + atts["_id"] = BSON::ObjectId.new + atts[:training_dataset_id] = training_set.id + validation_model = model.class.create training_set, atts + validation_model.save test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used prediction_dataset = validation_model.predict test_set_without_activities predictions = [] @@ -36,6 +45,7 @@ module OpenTox end end validation = self.new( + :model_id => validation_model.id, :prediction_dataset_id => prediction_dataset.id, :test_dataset_id => test_set.id, :nr_instances => test_set.compound_ids.size, -- cgit v1.2.3