From d5bf97c2cb999539c56bf59aa1d7d3286745be84 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 23 Sep 2015 14:51:41 +0200 Subject: validations fixed (all models were executed with default parameters) --- lib/compound.rb | 41 ++++++++++- lib/crossvalidation.rb | 7 +- lib/dataset.rb | 40 +++++++++++ lib/descriptor.rb | 3 +- lib/experiment.rb | 162 ++++++++++++++++++++----------------------- lib/model.rb | 88 ++++++++++++++--------- lib/unique_descriptors.rb | 4 +- lib/validation.rb | 12 +++- test/compound.rb | 12 ++++ test/dataset.rb | 10 +++ test/experiment.rb | 63 +++++++++++++++-- test/lazar-physchem-short.rb | 34 ++++----- test/lazar-regression.rb | 10 +++ test/validation.rb | 66 ++++++++++++++++-- 14 files changed, 394 insertions(+), 158 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index 7abd913..d3df125 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -44,6 +44,21 @@ module OpenTox compound.save compound end + + #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format + def mpd + smarts = obconversion(smiles,"smi","mpd").strip.split("\t") + smarts.shift # remove Title + smarts + + end + + #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html + def mna level=2 + smarts = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") + smarts.shift # remove Title + smarts + end def openbabel_fingerprint type="FP2" unless self.send(type.downcase.to_sym) # stored fingerprint @@ -72,7 +87,7 @@ module OpenTox end start += bitsperint end - update type.downcase.to_sym, bits_set + update_attribute type.downcase.to_sym, bits_set end self.send(type.downcase.to_sym) end @@ -242,6 +257,28 @@ module OpenTox neighbors end + def physchem_neighbors params + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] + neighbors = [] + feature_dataset.data_entries.each_with_index do |fingerprint, i| + # TODO implement pearson and cosine similarity separatly + R.assign "x", query_fingerprint + R.assign "y", fingerprint + # pearson r + #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby + #p "pearson" + #p sim + #p "cosine" + sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first + #p sim + if sim >= params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + def neighbors threshold=0.7 # TODO restrict to dataset # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb @@ -308,7 +345,7 @@ print sdf end def obconversion(identifier,input_format,output_format,option=nil) - self.class.obconversion(identifier,input_format,output_format,option=nil) + self.class.obconversion(identifier,input_format,output_format,option) end end end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 337b434..4c80344 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -33,15 +33,12 @@ module OpenTox nr_instances = 0 nr_unpredicted = 0 predictions = [] - validation_class = Object.const_get(self.to_s.sub(/Cross/,'')) training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| fork do # parallel execution of validations $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" t = Time.now - #p validation_class#.create(model, fold[0], fold[1],cv) - validation = validation_class.create(model, fold[0], fold[1],cv) - #p validation + validation = Validation.create(model, fold[0], fold[1],cv) $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" end end @@ -170,7 +167,7 @@ module OpenTox y = predictions.collect{|p| p[2]} R.assign "measurement", x R.assign "prediction", y - R.eval "r <- cor(-log(measurement),-log(prediction))" + R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" r = R.eval("r").to_ruby mae = mae/predictions.size diff --git a/lib/dataset.rb b/lib/dataset.rb index 946fd90..7c8ab44 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -46,6 +46,12 @@ module OpenTox else @data_entries = Marshal.load(data_entry_file.data) bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array + unless @data_entries.first.size == feature_ids.size + # TODO: fix (unknown) source of empty data_entries + sleep 1 + data_entry_file = $gridfs.find_one(_id: data_entries_id) + @data_entries = Marshal.load(data_entry_file.data) + end bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size # TODO: data_entries can be empty, poorly reproducible, mongo problem? bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size @@ -281,6 +287,29 @@ module OpenTox end end end + + def scale + scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)} + centers = [] + scales = [] + feature_ids.each_with_index do |feature_id,col| + R.assign "x", data_entries.collect{|de| de[col]} + R.eval "scaled = scale(x,center=T,scale=T)" + centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby + scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby + R.eval("scaled").to_ruby.each_with_index do |value,row| + scaled_data_entries[row][col] = value + end + end + scaled_dataset = ScaledDataset.new(attributes) + scaled_dataset["_id"] = BSON::ObjectId.new + scaled_dataset["_type"] = "OpenTox::ScaledDataset" + scaled_dataset.centers = centers + scaled_dataset.scales = scales + scaled_dataset.data_entries = scaled_data_entries + scaled_dataset.save_all + scaled_dataset + end end # Dataset for lazar predictions @@ -297,6 +326,17 @@ module OpenTox # Dataset for descriptors (physchem) class DescriptorDataset < Dataset field :feature_calculation_algorithm, type: String + + end + + class ScaledDataset < DescriptorDataset + + field :centers, type: Array, default: [] + field :scales, type: Array, default: [] + + def original_value value, i + value * scales[i] + centers[i] + end end # Dataset for fminer descriptors diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 5ae0ef2..9733bde 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -16,7 +16,7 @@ module OpenTox LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") - obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"] + obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| name,description = d.split(/\s+/,2) ["Openbabel."+name,description] unless obexclude.include? name @@ -107,6 +107,7 @@ module OpenTox des[lib] << descriptor end des.each do |lib,descriptors| + p lib, descriptors send(lib, descriptors) end serialize diff --git a/lib/experiment.rb b/lib/experiment.rb index 0a76c53..616a273 100644 --- a/lib/experiment.rb +++ b/lib/experiment.rb @@ -4,105 +4,93 @@ module OpenTox field :dataset_ids, type: Array field :model_settings, type: Array, default: [] field :results, type: Hash, default: {} - end - def run - dataset_ids.each do |dataset_id| - dataset = Dataset.find(dataset_id) - results[dataset_id.to_s] = [] - model_settings.each do |setting| - model = Object.const_get(setting[:algorithm]).create dataset - model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm] - model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm] - model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter] - model.save - repeated_crossvalidation = RepeatedCrossValidation.create model - results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id} + def run + dataset_ids.each do |dataset_id| + dataset = Dataset.find(dataset_id) + results[dataset_id.to_s] = [] + model_settings.each do |setting| + model_algorithm = setting.delete :model_algorithm + model = Object.const_get(model_algorithm).create dataset, setting + #model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm] + #model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm] + #model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter] + p model + model.save + repeated_crossvalidation = RepeatedCrossValidation.create model + results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id} + end end + save end - save - end - - def self.create params - experiment = self.new - $logge.debug "Experiment started ..." - #experiment.run params - experiment - end - def report - # TODO significances - # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/ - report = {} - report[:name] = name - report[:experiment_id] = self.id.to_s - report[:results] = {} - parameters = [] - dataset_ids.each do |dataset_id| - dataset_name = Dataset.find(dataset_id).name - report[:results][dataset_name] = {} - report[:results][dataset_name][:anova] = {} - report[:results][dataset_name][:data] = [] - results[dataset_id.to_s].each do |result| - model = Model::Lazar.find(result[:model_id]) - repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id]) - crossvalidations = repeated_cv.crossvalidations - if crossvalidations.first.is_a? ClassificationCrossValidation - parameters = [:accuracy,:true_rate,:predictivity] - elsif crossvalidations.first.is_a? RegressionCrossValidation - parameters = [:rmse,:mae,:r_squared] - end - summary = {} - [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key| - summary[key] = model[key] - end - summary[:nr_instances] = crossvalidations.first.nr_instances - summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted} - summary[:time] = crossvalidations.collect{|cv| cv.time} - parameters.each do |param| - summary[param] = crossvalidations.collect{|cv| cv.send(param)} + def report + # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/ + report = {} + report[:name] = name + report[:experiment_id] = self.id.to_s + report[:results] = {} + parameters = [] + dataset_ids.each do |dataset_id| + dataset_name = Dataset.find(dataset_id).name + report[:results][dataset_name] = {} + report[:results][dataset_name][:anova] = {} + report[:results][dataset_name][:data] = [] + results[dataset_id.to_s].each do |result| + model = Model::Lazar.find(result[:model_id]) + repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id]) + crossvalidations = repeated_cv.crossvalidations + if crossvalidations.first.is_a? ClassificationCrossValidation + parameters = [:accuracy,:true_rate,:predictivity] + elsif crossvalidations.first.is_a? RegressionCrossValidation + parameters = [:rmse,:mae,:r_squared] + end + summary = {} + [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key| + summary[key] = model[key] + end + summary[:nr_instances] = crossvalidations.first.nr_instances + summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted} + summary[:time] = crossvalidations.collect{|cv| cv.time} + parameters.each do |param| + summary[param] = crossvalidations.collect{|cv| cv.send(param)} + end + report[:results][dataset_name][:data] << summary end - report[:results][dataset_name][:data] << summary end - end - report[:results].each do |dataset,results| - ([:time,:nr_unpredicted]+parameters).each do |param| - experiments = [] - outcome = [] - results[:data].each_with_index do |result,i| - result[param].each do |p| - experiments << i - outcome << p + report[:results].each do |dataset,results| + ([:time,:nr_unpredicted]+parameters).each do |param| + experiments = [] + outcome = [] + results[:data].each_with_index do |result,i| + result[param].each do |p| + experiments << i + p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0 + outcome << p + end end - end - R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"} - R.eval "experiment_nr = factor(experiment_nr)" - R.assign "outcome",outcome - R.eval "data = data.frame(experiment_nr,outcome)" - # one-way ANOVA - R.eval "fit = aov(outcome ~ experiment_nr, data=data)" - # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov - p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby - # aequivalent - # sum = R.eval("summary(fit)") - #p_value = sum.to_ruby.first.last.first + R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"} + R.eval "experiment_nr = factor(experiment_nr)" + R.assign "outcome", outcome + R.eval "data = data.frame(experiment_nr,outcome)" + # one-way ANOVA + R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')" + # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov + p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby + # aequivalent + # sum = R.eval("summary(fit)") + #p_value = sum.to_ruby.first.last.first + report[:results][dataset][:anova][param] = p_value =begin - if p_value < 0.01 - p_value = "#{p_value} ***" - elsif p_value < 0.05 - p_value = "#{p_value} **" - elsif p_value < 0.1 - p_value = "#{p_value} *" - end =end - report[:results][dataset][:anova][param] = p_value + end end + report end - report - end - def summary - report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}} + def summary + report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}} + end end end diff --git a/lib/model.rb b/lib/model.rb index 9892f64..817a61e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -26,25 +26,26 @@ module OpenTox # algorithms field :neighbor_algorithm, type: String - field :neighbor_algorithm_parameters, type: Hash + field :neighbor_algorithm_parameters, type: Hash, default: {} # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset # @return [OpenTox::Model::Lazar] Regression or classification model - def self.create training_dataset + def initialize training_dataset, params={} + super params bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 # TODO document convention prediction_feature = training_dataset.features.first - prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new - lazar.training_dataset_id = training_dataset.id - lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id - lazar.prediction_feature_id = prediction_feature.id - lazar.name = "#{training_dataset.name} #{prediction_feature.name}" - - lazar.save - lazar + # set defaults for empty parameters + self.prediction_feature_id ||= prediction_feature.id + self.training_dataset_id ||= training_dataset.id + self.name ||= "#{training_dataset.name} #{prediction_feature.name}" + self.neighbor_algorithm_parameters ||= {} + self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + save + self end def predict object @@ -80,6 +81,7 @@ module OpenTox next end neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) + #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) # add activities # TODO: improve efficiency, takes 3 times longer than previous version @@ -90,6 +92,17 @@ module OpenTox end neighbors.compact! # remove neighbors without training activities predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size}) +=begin +# TODO scaled dataset for physchem + p neighbor_algorithm_parameters + p (neighbor_algorithm_parameters["feature_dataset_id"]) + d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]) + p d + p d.class + if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset + p "SCALED" + end +=end end # serialize result @@ -128,15 +141,40 @@ module OpenTox end class LazarClassification < Lazar - def initialize - super - self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" - self.neighbor_algorithm = "fingerprint_neighbors" - self.neighbor_algorithm_parameters = { + + def self.create training_dataset, params={} + model = self.new training_dataset, params + model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm + model.neighbor_algorithm |= "fingerprint_neighbors" + model.neighbor_algorithm_parameters ||= {} + { :type => "FP4", - :training_dataset_id => training_dataset_id, + :training_dataset_id => training_dataset.id, :min_sim => 0.7 - } + }.each do |key,value| + model.neighbor_algorithm_parameters[key] ||= value + end + model.save + model + end + end + + class LazarRegression < Lazar + + def self.create training_dataset, params={} + model = self.new training_dataset, params + #model.neighbor_algorithm ||= "fingerprint_neighbors" + #model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average" + #model.neighbor_algorithm_parameters ||= {} + #{ + #:type => "FP4", + #:training_dataset_id => training_dataset.id, + #:min_sim => 0.7 + #}.each do |key,value| + #model.neighbor_algorithm_parameters[key] ||= value + #end + model.save + model end end @@ -159,26 +197,12 @@ module OpenTox end end - class LazarRegression < Lazar - - def initialize - super - #self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" - self.neighbor_algorithm = "fingerprint_neighbors" - self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" - self.neighbor_algorithm_parameters = { - :type => "FP4", - :training_dataset_id => self.training_dataset_id, - :min_sim => 0.7 - } - end - end - class Prediction include OpenTox include Mongoid::Document include Mongoid::Timestamps + # TODO cv -> repeated cv # TODO field Validations field :endpoint, type: String field :species, type: String diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb index 676f34a..cf9cbf3 100644 --- a/lib/unique_descriptors.rb +++ b/lib/unique_descriptors.rb @@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [ "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib) "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib) "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib) - "Openbabel.L5", #Lipinski Rule of Five + #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! "Openbabel.logP", #octanol/water partition coefficient "Openbabel.MP", #Melting point "Openbabel.MR", #molar refractivity @@ -56,7 +56,7 @@ UNIQUEDESCRIPTORS = [ "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth. "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O - "Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms . + #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms . "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration. "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule. "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule. diff --git a/lib/validation.rb b/lib/validation.rb index 63fbd89..9eebef8 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -2,6 +2,7 @@ module OpenTox class Validation + field :model_id, type: BSON::ObjectId field :prediction_dataset_id, type: BSON::ObjectId field :crossvalidation_id, type: BSON::ObjectId field :test_dataset_id, type: BSON::ObjectId @@ -17,9 +18,17 @@ module OpenTox Dataset.find test_dataset_id end + def model + Model::Lazar.find model_id + end + def self.create model, training_set, test_set, crossvalidation=nil - validation_model = model.class.create training_set#, features + atts = model.attributes.dup # do not modify attributes from original model + atts["_id"] = BSON::ObjectId.new + atts[:training_dataset_id] = training_set.id + validation_model = model.class.create training_set, atts + validation_model.save test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used prediction_dataset = validation_model.predict test_set_without_activities predictions = [] @@ -36,6 +45,7 @@ module OpenTox end end validation = self.new( + :model_id => validation_model.id, :prediction_dataset_id => prediction_dataset.id, :test_dataset_id => test_set.id, :nr_instances => test_set.compound_ids.size, diff --git a/test/compound.rb b/test/compound.rb index 6a3c696..b33a643 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -134,4 +134,16 @@ print c.sdf end end end + + def test_mna + c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F" + p c.mna 4 + end + + def test_mpd + c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F" + assert 13, c.mpd.size + assert 7, c.mpd.uniq.size + assert_equal c.mpd, c.openbabel_fingerprint("mpd") + end end diff --git a/test/dataset.rb b/test/dataset.rb index 84be547..752073e 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -202,5 +202,15 @@ class DatasetTest < MiniTest::Test assert_equal 0.00323, d2.data_entries[5][0] end + def test_scaled_dataset + original_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv") + scaled_dataset = original_dataset.scale + scaled_dataset.data_entries.each_with_index do |row,i| + row.each_with_index do |value,j| + assert_equal original_dataset.data_entries[i][j].round(4), scaled_dataset.original_value(value,j).round(4) if value # ignore nils + end + end + end + end diff --git a/test/experiment.rb b/test/experiment.rb index 4b54768..76a0498 100644 --- a/test/experiment.rb +++ b/test/experiment.rb @@ -63,21 +63,26 @@ class ExperimentTest < MiniTest::Test end def test_regression_fingerprints +=begin datasets = [ - "LOAEL_mmol_corrected_smiles.csv" + "EPAFHM.medi.csv", + #"LOAEL_mmol_corrected_smiles.csv" ] min_sims = [0.3,0.7] - types = ["FP2","FP3","FP4","MACCS"] + #min_sims = [0.7] + #types = ["FP2","FP3","FP4","MACCS","mpd"] + types = ["mpd","FP3"] experiment = Experiment.create( - :name => "Fminer vs fingerprint classification for datasets #{datasets}.", + :name => "Fingerprint regression with different types for datasets #{datasets}.", :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, ) types.each do |type| min_sims.each do |min_sim| experiment.model_settings << { - :algorithm => "OpenTox::Model::LazarRegression", + :model_algorithm => "OpenTox::Model::LazarRegression", + :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average", :neighbor_algorithm => "fingerprint_neighbors", - :neighbor_algorithm_parameter => { + :neighbor_algorithm_parameters => { :type => type, :min_sim => min_sim, } @@ -85,7 +90,53 @@ class ExperimentTest < MiniTest::Test end end experiment.run - p experiment.report +=end +#=begin + experiment = Experiment.find '56029cb92b72ed673d000000' +#=end + p experiment.id + experiment.results.each do |dataset,result| + result.each do |r| + params = Model::Lazar.find(r["model_id"])[:neighbor_algorithm_parameters] + RepeatedCrossValidation.find(r["repeated_crossvalidation_id"]).crossvalidations.each do |cv| + cv.validation_ids.each do |vid| + model_params = Model::Lazar.find(Validation.find(vid).model_id)[:neighbor_algorithm_parameters] + assert_equal params[:type], model_params[:type] + assert_equal params[:min_sim], model_params[:min_sim] + refute_equal params[:training_dataset_id], model_params[:training_dataset_id] + end + end + end + end + puts experiment.report.to_yaml + p experiment.summary + end + def test_mpd_fingerprints +=begin + datasets = [ + "EPAFHM.medi.csv", + ] + types = ["FP2","mpd"] + experiment = Experiment.create( + :name => "FP2 vs mpd fingerprint regression for datasets #{datasets}.", + :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, + ) + types.each do |type| + experiment.model_settings << { + :algorithm => "OpenTox::Model::LazarRegression", + :neighbor_algorithm => "fingerprint_neighbors", + :neighbor_algorithm_parameter => { + :type => type, + :min_sim => 0.7, + } + } + end + experiment.run + p experiment.id +=end + experiment = Experiment.find '55ffd0c02b72ed123c000000' + p experiment + puts experiment.report.to_yaml end end diff --git a/test/lazar-physchem-short.rb b/test/lazar-physchem-short.rb index e74a4b9..59d8112 100644 --- a/test/lazar-physchem-short.rb +++ b/test/lazar-physchem-short.rb @@ -2,27 +2,29 @@ require_relative "setup.rb" class LazarPhyschemDescriptorTest < MiniTest::Test def test_epafhm - skip "Physchem Regression not yet implemented." - # check available descriptors - @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys - assert_equal 111,@descriptors.size,"wrong number of physchem descriptors" - @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES - # select descriptors for test - @num_features_offset = 0 - @descriptors.keep_if{|x| x=~/^Openbabel\./} - @descriptors.delete("Openbabel.L5") # TODO Openbabel.L5 does not work, investigate!!! - puts "Descriptors: #{@descriptors}" + @descriptors = OpenTox::Algorithm::Descriptor::OBDESCRIPTORS.keys + refute_empty @descriptors # UPLOAD DATA training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv") - puts "Dataset: "+training_dataset.id -# feature_dataset = Algorithm::Descriptor.physchem training_dataset, @descriptors - model = Model::LazarRegression.create training_dataset#, feature_dataset - #p model + feature_dataset = Algorithm::Descriptor.physchem training_dataset, @descriptors + scaled_feature_dataset = feature_dataset.scale + model = Model::LazarRegression.create training_dataset + model.neighbor_algorithm = "physchem_neighbors" + model.neighbor_algorithm_parameters = { + :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.physchem", + :descriptors => @descriptors, + :feature_dataset_id => scaled_feature_dataset.id, + :min_sim => 0.3 + } + model.save compound = Compound.from_smiles "CC(C)(C)CN" prediction = model.predict compound - p prediction - + refute_nil prediction[:value] + refute_nil prediction[:confidence] + prediction[:neighbors].each do |line| + assert_operator line[1], :>, 0.3 + end end end diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb index cc7f356..8b2d473 100644 --- a/test/lazar-regression.rb +++ b/test/lazar-regression.rb @@ -13,6 +13,16 @@ class LazarRegressionTest < MiniTest::Test assert_equal 1, prediction[:neighbors].size end + def test_mpd_fingerprints + training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" + model = Model::LazarRegression.create training_dataset + model.neighbor_algorithm_parameters[:type] = "mpd" + compound = Compound.from_smiles "CCCSCCSCC" + prediction = model.predict compound + assert_equal 0.04, prediction[:value].round(2) + assert_equal 1, prediction[:neighbors].size + end + def test_local_linear_regression skip training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" diff --git a/test/validation.rb b/test/validation.rb index dfa2c81..9717ccc 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -20,10 +20,25 @@ class ValidationTest < MiniTest::Test end def test_regression_crossvalidation - #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" - dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv" - model = Model::LazarRegression.create dataset + dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" + #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv" + params = { + :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average", + :neighbor_algorithm => "fingerprint_neighbors", + :neighbor_algorithm_parameters => { + :type => "MACCS", + :min_sim => 0.7, + } + } + model = Model::LazarRegression.create dataset, params cv = RegressionCrossValidation.create model + cv.validation_ids.each do |vid| + model = Model::Lazar.find(Validation.find(vid).model_id) + assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type] + assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim] + refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id] + end + #`inkview #{cv.plot}` #puts JSON.pretty_generate(cv.misclassifications)#.collect{|l| l.join ", "}.join "\n" #`inkview #{cv.plot}` @@ -37,12 +52,51 @@ class ValidationTest < MiniTest::Test dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" model = Model::LazarClassification.create dataset repeated_cv = RepeatedCrossValidation.create model - p repeated_cv repeated_cv.crossvalidations.each do |cv| - p cv - p cv.accuracy assert cv.accuracy > 0.7 end end + def test_crossvalidation_parameters + dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + params = { + :neighbor_algorithm_parameters => { + :min_sim => 0.3, + :type => "FP3" + } + } + model = Model::LazarClassification.create dataset, params + model.save + cv = ClassificationCrossValidation.create model + params = model.neighbor_algorithm_parameters + params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string + cv.validations.each do |validation| + assert_equal params, validation.model.neighbor_algorithm_parameters + end + end + + def test_physchem_regression_crossvalidation + + @descriptors = OpenTox::Algorithm::Descriptor::OBDESCRIPTORS.keys + refute_empty @descriptors + + # UPLOAD DATA + training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv") + feature_dataset = Algorithm::Descriptor.physchem training_dataset, @descriptors + feature_dataset.save + scaled_feature_dataset = feature_dataset.scale + scaled_feature_dataset.save + model = Model::LazarRegression.create training_dataset + model.neighbor_algorithm = "physchem_neighbors" + model.neighbor_algorithm_parameters = { + :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.physchem", + :descriptors => @descriptors, + :feature_dataset_id => scaled_feature_dataset.id, + :min_sim => 0.3 + } + model.save + cv = RegressionCrossValidation.create model + p cv + end + end -- cgit v1.2.3