From d3a4c309d48b794f2f60f44bb9a3d94f402cc82f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 16 Sep 2015 13:11:45 +0200 Subject: repeated crossvalidations, improved experiment reports --- lib/crossvalidation.rb | 57 +++++++++++++++++++++-------------- lib/dataset.rb | 1 + lib/error.rb | 2 +- lib/experiment.rb | 81 +++++++++++++++++++++++++------------------------- lib/lazar.rb | 5 ++-- lib/model.rb | 3 -- test/experiment.rb | 62 +++++++++++++++++++++++++++++--------- test/validation.rb | 12 ++++++++ 8 files changed, 141 insertions(+), 82 deletions(-) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 90c0d75..f480932 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -22,7 +22,9 @@ module OpenTox end def self.create model, n=10 - cv = self.new( + model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation + bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass + cv = klass.new( name: model.name, model_id: model.id, folds: n @@ -55,6 +57,7 @@ module OpenTox nr_unpredicted: nr_unpredicted, predictions: predictions ) + cv.statistics cv end end @@ -70,14 +73,13 @@ module OpenTox field :predictivity, type: Hash # TODO auc, f-measure (usability??) - def self.create model, n=10 - cv = super model, n + def statistics accept_values = Feature.find(model.prediction_feature_id).accept_values confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} true_rate = {} predictivity = {} - cv.predictions.each do |pred| + predictions.each do |pred| compound_id,activity,prediction,confidence = pred if activity and prediction and confidence.numeric? if prediction == activity @@ -113,18 +115,16 @@ module OpenTox confidence_sum += c end end - cv.update_attributes( + update_attributes( accept_values: accept_values, confusion_matrix: confusion_matrix, weighted_confusion_matrix: weighted_confusion_matrix, - accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(cv.nr_instances-cv.nr_unpredicted).to_f, + accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, true_rate: true_rate, predictivity: predictivity, finished_at: Time.now ) - cv.save - cv end #Average area under roc 0.646 @@ -142,8 +142,7 @@ module OpenTox field :correlation_plot_id, type: BSON::ObjectId field :confidence_plot_id, type: BSON::ObjectId - def self.create model, n=10 - cv = super model, n + def statistics rmse = 0 weighted_rmse = 0 rse = 0 @@ -153,7 +152,7 @@ module OpenTox rae = 0 weighted_rae = 0 confidence_sum = 0 - cv.predictions.each do |pred| + predictions.each do |pred| compound_id,activity,prediction,confidence = pred if activity and prediction error = Math.log10(prediction)-Math.log10(activity) @@ -163,24 +162,24 @@ module OpenTox weighted_mae += confidence*error.abs confidence_sum += confidence else - cv.warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}." + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." end end - x = cv.predictions.collect{|p| p[1]} - y = cv.predictions.collect{|p| p[2]} + x = predictions.collect{|p| p[1]} + y = predictions.collect{|p| p[2]} R.assign "measurement", x R.assign "prediction", y R.eval "r <- cor(-log(measurement),-log(prediction))" r = R.eval("r").to_ruby - mae = mae/cv.predictions.size + mae = mae/predictions.size weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/cv.predictions.size) + rmse = Math.sqrt(rmse/predictions.size) weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) # TODO check!! =begin - cv.predictions.sort! do |a,b| + predictions.sort! do |a,b| relative_error_a = (a[1]-a[2]).abs/a[1].to_f relative_error_a = 1/relative_error_a if relative_error_a < 1 relative_error_b = (b[1]-b[2]).abs/b[1].to_f @@ -188,15 +187,14 @@ module OpenTox [relative_error_b,b[3]] <=> [relative_error_a,a[3]] end =end - cv.update_attributes( + update_attributes( mae: mae, rmse: rmse, weighted_mae: weighted_mae, weighted_rmse: weighted_rmse, - r_squared: r**2 + r_squared: r**2, + finished_at: Time.now ) - cv.save - cv end def misclassifications n=nil @@ -277,5 +275,20 @@ module OpenTox end end + class RepeatedCrossValidation + field :crossvalidation_ids, type: Array, default: [] + def self.create model, folds=10, repeats=3 + repeated_cross_validation = self.new + repeats.times do + repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + end + repeated_cross_validation.save + repeated_cross_validation + end + def crossvalidations + crossvalidation_ids.collect{|id| CrossValidation.find(id)} + end + end + end diff --git a/lib/dataset.rb b/lib/dataset.rb index 851fabd..d884716 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -47,6 +47,7 @@ module OpenTox @data_entries = Marshal.load(data_entry_file.data) bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size + # TODO: data_entries can be empty, poorly reproducible, mongo problem? bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size #$logger.debug "Retrieving data: #{Time.now-t}" end diff --git a/lib/error.rb b/lib/error.rb index 8fe8a1e..39b3c76 100644 --- a/lib/error.rb +++ b/lib/error.rb @@ -58,7 +58,7 @@ module OpenTox OpenTox.const_set error[:class],c # define global methods for raising errors, eg. bad_request_error - Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil| + Object.send(:define_method, error[:method]) do |message| raise c.new(message) end end diff --git a/lib/experiment.rb b/lib/experiment.rb index 2f51756..7849337 100644 --- a/lib/experiment.rb +++ b/lib/experiment.rb @@ -2,45 +2,22 @@ module OpenTox class Experiment field :dataset_ids, type: Array - field :model_algorithms, type: Array - field :model_ids, type: Array, default: [] - field :crossvalidation_ids, type: Array, default: [] - field :prediction_algorithms, type: Array - field :neighbor_algorithms, type: Array - field :neighbor_algorithm_parameters, type: Array + field :model_settings, type: Array + field :results, type: Hash, default: {} end - # TODO more sophisticated experimental design def run dataset_ids.each do |dataset_id| dataset = Dataset.find(dataset_id) - model_algorithms.each do |model_algorithm| - prediction_algorithms.each do |prediction_algorithm| - neighbor_algorithms.each do |neighbor_algorithm| - neighbor_algorithm_parameters.each do |neighbor_algorithm_parameter| - $logger.debug "Creating #{model_algorithm} model for dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}." - model = Object.const_get(model_algorithm).create dataset - model.prediction_algorithm = prediction_algorithm - model.neighbor_algorithm = neighbor_algorithm - model.neighbor_algorithm_parameters = neighbor_algorithm_parameter - model.save - model_ids << model.id - cv = nil - if dataset.features.first.nominal - cv = ClassificationCrossValidation - elsif dataset.features.first.numeric - cv = RegressionCrossValidation - end - if cv - $logger.debug "Creating #{cv} for #{model_algorithm}, dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}." - crossvalidation = cv.create model - self.crossvalidation_ids << crossvalidation.id - else - $logger.warn "#{dataset.features.first} is neither nominal nor numeric." - end - end - end - end + results[dataset_id.to_s] = [] + model_settings.each do |setting| + model = Object.const_get(setting[:algorithm]).create dataset + model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm] + model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm] + model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter] + model.save + repeated_crossvalidation = RepeatedCrossValidation.create model + results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id} end end save @@ -54,13 +31,37 @@ module OpenTox end def report - # TODO create ggplot2 report - self.crossvalidation_ids.each do |id| - cv = CrossValidation.find(id) - file = "/tmp/#{id}.svg" - File.open(file,"w+"){|f| f.puts cv.correlation_plot} - `inkview '#{file}'` + # TODO significances + report = {} + report[:name] = name + report[:experiment_id] = self.id.to_s + dataset_ids.each do |dataset_id| + dataset_name = Dataset.find(dataset_id).name + report[dataset_name] = [] + results[dataset_id.to_s].each do |result| + model = Model::Lazar.find(result[:model_id]) + repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id]) + crossvalidations = repeated_cv.crossvalidations + summary = {} + [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key| + summary[key] = model[key] + end + summary[:nr_instances] = crossvalidations.first.nr_instances + summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted} + summary[:time] = crossvalidations.collect{|cv| cv.time} + if crossvalidations.first.is_a? ClassificationCrossValidation + summary[:accuracies] = crossvalidations.collect{|cv| cv.accuracy} + elsif crossvalidations.first.is_a? RegressionCrossValidation + summary[:r_squared] = crossvalidations.collect{|cv| cv.r_squared} + end + report[dataset_name] << summary + #p repeated_cv.crossvalidations.collect{|cv| cv.accuracy} + #file = "/tmp/#{id}.svg" + #File.open(file,"w+"){|f| f.puts cv.correlation_plot} + #`inkview '#{file}'` + end end + report end end diff --git a/lib/lazar.rb b/lib/lazar.rb index decbe69..9b02053 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -15,7 +15,8 @@ ENV["MONGOID_ENV"] ||= "development" # TODO remove config files, change default via ENV or directly in Mongoid class Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}") Mongoid.raise_not_found_error = false # return nil if no document is found -$mongo = Mongoid.default_client +$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox') +#$mongo = Mongoid.default_client $gridfs = $mongo.database.fs # R setup @@ -42,7 +43,7 @@ ENV['FMINER_SILENT'] = 'true' ENV['FMINER_NR_HITS'] = 'true' # OpenTox classes and includes -CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", diff --git a/lib/model.rb b/lib/model.rb index 0155fc8..ddb69e4 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -28,9 +28,6 @@ module OpenTox field :neighbor_algorithm, type: String field :neighbor_algorithm_parameters, type: Hash - #attr_accessor :prediction_dataset - #attr_accessor :training_dataset - # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset # @return [OpenTox::Model::Lazar] Regression or classification model diff --git a/test/experiment.rb b/test/experiment.rb index c465d7b..cad4fa7 100644 --- a/test/experiment.rb +++ b/test/experiment.rb @@ -4,27 +4,61 @@ class ExperimentTest < MiniTest::Test def test_regression_experiment datasets = [ - "EPAFHM.csv", - "FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv", + "EPAFHM.medi.csv", + #"EPAFHM.csv", + #"FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv", "LOAEL_mmol_corrected_smiles.csv" + ] + experiment = Experiment.create( + :name => "Default regression for datasets #{datasets}.", + :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, + :model_settings => [ + { + :algorithm => "OpenTox::Model::LazarRegression", + } ] - model_algorithms = ["OpenTox::Model::LazarRegression"] - neighbor_algorithms = ["OpenTox::Algorithm::Neighbor.fingerprint_similarity"] - prediction_algorithms = ["OpenTox::Algorithm::Regression.weighted_average"] - neighbor_algorithm_parameters = [{:min_sim => 0.7}] + ) + experiment.run + puts experiment.report.to_yaml + assert_equal datasets.size, experiment.results.size + experiment.results.each do |dataset_id, result| + assert_equal 1, result.size + result.each do |r| + assert_kind_of BSON::ObjectId, r[:model_id] + assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id] + end + end + end + + def test_classification_experiment + + datasets = [ "hamster_carcinogenicity.csv" ] experiment = Experiment.create( - :name => "Regression for datasets #{datasets}.", + :name => "Fminer vs fingerprint classification for datasets #{datasets}.", :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, - :model_algorithms => model_algorithms, - :neighbor_algorithms => neighbor_algorithms, - :neighbor_algorithm_parameters => neighbor_algorithm_parameters, - :prediction_algorithms => prediction_algorithms, + :model_settings => [ + { + :algorithm => "OpenTox::Model::LazarClassification", + },{ + :algorithm => "OpenTox::Model::LazarClassification", + :neighbor_algorithm_parameter => {:min_sim => 0.3} + }, + #{ + #:algorithm => "OpenTox::Model::LazarFminerClassification", + #} + ] ) experiment.run =begin - p experiment - experiment.report + experiment = Experiment.find "55f944a22b72ed7de2000000" =end - refute_empty experiment.crossvalidation_ids + puts experiment.report.to_yaml + experiment.results.each do |dataset_id, result| + assert_equal 2, result.size + result.each do |r| + assert_kind_of BSON::ObjectId, r[:model_id] + assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id] + end + end end end diff --git a/test/validation.rb b/test/validation.rb index a4c3d80..dfa2c81 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -33,4 +33,16 @@ class ValidationTest < MiniTest::Test #assert cv.weighted_mae < cv.mae end + def test_repeated_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + model = Model::LazarClassification.create dataset + repeated_cv = RepeatedCrossValidation.create model + p repeated_cv + repeated_cv.crossvalidations.each do |cv| + p cv + p cv.accuracy + assert cv.accuracy > 0.7 + end + end + end -- cgit v1.2.3 From 2fdecbed76c4db8dfe3f10f825fed9772e653197 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 16 Sep 2015 16:52:18 +0200 Subject: generic openbabel fingerprints --- lib/compound.rb | 31 +++++++++++++++++++++++++++++-- test/compound.rb | 11 +++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index 6adf3c0..7f175ca 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -42,6 +42,35 @@ module OpenTox compound end + def openbabel_fingerprint type="FP2" + fp = OpenBabel::OBFingerprint.find_fingerprint(type) + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format "smi" + obconversion.read_string obmol, smiles + result = OpenBabel::VectorUnsignedInt.new + fp.get_fingerprint(obmol,result) + # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i + #p OpenBabel::OBFingerprint.describe_bits(result) + result = result.to_a + # convert result to a list of the bits that are set + # from openbabel/scripts/python/pybel.py line 830 + # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints + bitsperint = OpenBabel::OBFingerprint.getbitsperint() + bits_set = [] + start = 1 + result.each do |x| + i = start + while x > 0 do + bits_set << i if (x % 2) == 1 + x >>= 1 + i += 1 + end + start += bitsperint + end + bits_set + end + # Create a compound from smiles string # @example # compound = OpenTox::Compound.from_smiles("c1ccccc1") @@ -202,8 +231,6 @@ module OpenTox $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } end -=begin -=end private diff --git a/test/compound.rb b/test/compound.rb index 06c19a2..6deba4e 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -97,4 +97,15 @@ print c.sdf c = Compound.from_inchi(inchi) assert_equal inchi, c.inchi end + + def test_openbabel_fingerprint + [ + "CC(=O)CC(C)C#N", + "CC(=O)CC(C)C", + "C(=O)CC(C)C#N", + ].each do |smi| + c = OpenTox::Compound.from_smiles smi + assert_equal c.openbabel_fingerprint("FP4").size, c.fp4.size + end + end end -- cgit v1.2.3 From 6ac119c32cef094d4f1c2fb5c2daa4e274401f70 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 17 Sep 2015 14:56:25 +0200 Subject: neighbor calculation moved to Compound class --- lib/compound.rb | 84 +++++++++++++++++++++++++++++++++++--------------- lib/crossvalidation.rb | 3 +- lib/dataset.rb | 2 +- lib/experiment.rb | 4 +-- lib/lazar.rb | 2 +- lib/model.rb | 24 +++++++++++---- lib/neighbor.rb | 25 --------------- lib/opentox.rb | 1 - test/compound.rb | 26 ++++++++++++++++ test/experiment.rb | 31 +++++++++++++++++-- 10 files changed, 139 insertions(+), 63 deletions(-) delete mode 100644 lib/neighbor.rb diff --git a/lib/compound.rb b/lib/compound.rb index 7f175ca..7abd913 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -19,8 +19,11 @@ module OpenTox field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId + field :fp2, type: Array + field :fp3, type: Array field :fp4, type: Array field :fp4_size, type: Integer + field :maccs, type: Array index({smiles: 1}, {unique: true}) @@ -43,32 +46,35 @@ module OpenTox end def openbabel_fingerprint type="FP2" - fp = OpenBabel::OBFingerprint.find_fingerprint(type) - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format "smi" - obconversion.read_string obmol, smiles - result = OpenBabel::VectorUnsignedInt.new - fp.get_fingerprint(obmol,result) - # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i - #p OpenBabel::OBFingerprint.describe_bits(result) - result = result.to_a - # convert result to a list of the bits that are set - # from openbabel/scripts/python/pybel.py line 830 - # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints - bitsperint = OpenBabel::OBFingerprint.getbitsperint() - bits_set = [] - start = 1 - result.each do |x| - i = start - while x > 0 do - bits_set << i if (x % 2) == 1 - x >>= 1 - i += 1 + unless self.send(type.downcase.to_sym) # stored fingerprint + fp = OpenBabel::OBFingerprint.find_fingerprint(type) + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format "smi" + obconversion.read_string obmol, smiles + result = OpenBabel::VectorUnsignedInt.new + fp.get_fingerprint(obmol,result) + # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i + #p OpenBabel::OBFingerprint.describe_bits(result) + # convert result to a list of the bits that are set + # from openbabel/scripts/python/pybel.py line 830 + # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints + result = result.to_a + bitsperint = OpenBabel::OBFingerprint.getbitsperint() + bits_set = [] + start = 1 + result.each do |x| + i = start + while x > 0 do + bits_set << i if (x % 2) == 1 + x >>= 1 + i += 1 + end + start += bitsperint end - start += bitsperint + update type.downcase.to_sym, bits_set end - bits_set + self.send(type.downcase.to_sym) end # Create a compound from smiles string @@ -206,6 +212,36 @@ module OpenTox self["chemblid"] end + def fingerprint_neighbors params + bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim] + neighbors = [] + query_fingerprint = self.openbabel_fingerprint params[:type] + training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| + unless self == compound + fingerprint = compound.openbabel_fingerprint params[:type] + sim = (query_fingerprint & fingerprint).size/(query_fingerprint | fingerprint).size.to_f + neighbors << [compound.id, sim] if sim >= params[:min_sim] + end + end + neighbors.sort{|a,b| b.last <=> a.last} + end + + def fminer_neighbors params + bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim] + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) + neighbors = [] + + # find neighbors + feature_dataset.data_entries.each_with_index do |fingerprint, i| + sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint + if sim >= params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + def neighbors threshold=0.7 # TODO restrict to dataset # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index f480932..337b434 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -279,7 +279,8 @@ module OpenTox field :crossvalidation_ids, type: Array, default: [] def self.create model, folds=10, repeats=3 repeated_cross_validation = self.new - repeats.times do + repeats.times do |n| + $logger.debug "Crossvalidation #{n+1} for #{model.name}" repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id end repeated_cross_validation.save diff --git a/lib/dataset.rb b/lib/dataset.rb index d884716..7d889f8 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -152,7 +152,7 @@ module OpenTox name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) if dataset - $logger.debug "Skipping #{file}, it is already in the database (id: #{dataset.id})." + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." else $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true diff --git a/lib/experiment.rb b/lib/experiment.rb index 7849337..985a491 100644 --- a/lib/experiment.rb +++ b/lib/experiment.rb @@ -2,7 +2,7 @@ module OpenTox class Experiment field :dataset_ids, type: Array - field :model_settings, type: Array + field :model_settings, type: Array, default: [] field :results, type: Hash, default: {} end @@ -26,7 +26,7 @@ module OpenTox def self.create params experiment = self.new $logge.debug "Experiment started ..." - experiment.run params + #experiment.run params experiment end diff --git a/lib/lazar.rb b/lib/lazar.rb index 9b02053..89b50f7 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -59,7 +59,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Repeat "bbrc.rb", "model.rb", "similarity.rb", - "neighbor.rb", + #"neighbor.rb", "classification.rb", "regression.rb", "validation.rb", diff --git a/lib/model.rb b/lib/model.rb index ddb69e4..9892f64 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -39,6 +39,7 @@ module OpenTox prediction_feature = training_dataset.features.first prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new lazar.training_dataset_id = training_dataset.id + lazar.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id lazar.prediction_feature_id = prediction_feature.id lazar.name = "#{training_dataset.name} #{prediction_feature.name}" @@ -78,7 +79,8 @@ module OpenTox predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} next end - neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) + neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) + #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) # add activities # TODO: improve efficiency, takes 3 times longer than previous version neighbors.collect! do |n| @@ -129,8 +131,12 @@ module OpenTox def initialize super self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" - self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" - self.neighbor_algorithm_parameters = {:min_sim => 0.7} + self.neighbor_algorithm = "fingerprint_neighbors" + self.neighbor_algorithm_parameters = { + :type => "FP4", + :training_dataset_id => training_dataset_id, + :min_sim => 0.7 + } end end @@ -141,7 +147,7 @@ module OpenTox model = super(training_dataset) model.update "_type" => self.to_s # adjust class model = self.find model.id # adjust class - model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity" + model.neighbor_algorithm = "fminer_neighbors" model.neighbor_algorithm_parameters = { :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id, @@ -154,11 +160,17 @@ module OpenTox end class LazarRegression < Lazar + def initialize super - self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + #self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + self.neighbor_algorithm = "fingerprint_neighbors" self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" - self.neighbor_algorithm_parameters = {:min_sim => 0.7} + self.neighbor_algorithm_parameters = { + :type => "FP4", + :training_dataset_id => self.training_dataset_id, + :min_sim => 0.7 + } end end diff --git a/lib/neighbor.rb b/lib/neighbor.rb deleted file mode 100644 index d849cbf..0000000 --- a/lib/neighbor.rb +++ /dev/null @@ -1,25 +0,0 @@ -module OpenTox - module Algorithm - class Neighbor - - def self.fingerprint_similarity compound, params={} - compound.neighbors params[:min_sim] - end - - def self.fminer_similarity compound, params - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features) - neighbors = [] - - # find neighbors - feature_dataset.data_entries.each_with_index do |fingerprint, i| - sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint - if sim > params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming - end - end - neighbors - end - end - end -end diff --git a/lib/opentox.rb b/lib/opentox.rb index 875487c..186c87a 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -14,7 +14,6 @@ module OpenTox store_in collection: klass.downcase.pluralize field :name, type: String field :warnings, type: Array, default: [] - end OpenTox.const_set klass,c end diff --git a/test/compound.rb b/test/compound.rb index 6deba4e..6a3c696 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -108,4 +108,30 @@ print c.sdf assert_equal c.openbabel_fingerprint("FP4").size, c.fp4.size end end + + def test_fingerprint_neighbors + types = ["FP2", "FP3", "FP4", "MACCS"] + min_sim = 0.7 + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv") + [ + "CC(=O)CC(C)C#N", + "CC(=O)CC(C)C", + "C(=O)CC(C)C#N", + ].each do |smi| + c = OpenTox::Compound.from_smiles smi + p c.smiles + types.each do |type| + p type + neighbors = c.fingerprint_neighbors({:type => type, :training_dataset_id => training_dataset.id, :min_sim => min_sim}) + p neighbors.collect{|n| [Compound.find(n.first).smiles,n.last]} + if type == "FP4" + fp4_neighbors = c.neighbors + neighbors.each do |n| + p [Compound.find(n.first).smiles,n.last] unless fp4_neighbors.include?(n) + assert_includes fp4_neighbors, n + end + end + end + end + end end diff --git a/test/experiment.rb b/test/experiment.rb index cad4fa7..4b54768 100644 --- a/test/experiment.rb +++ b/test/experiment.rb @@ -18,7 +18,7 @@ class ExperimentTest < MiniTest::Test } ] ) - experiment.run + #experiment.run puts experiment.report.to_yaml assert_equal datasets.size, experiment.results.size experiment.results.each do |dataset_id, result| @@ -48,7 +48,7 @@ class ExperimentTest < MiniTest::Test #} ] ) - experiment.run + #experiment.run =begin experiment = Experiment.find "55f944a22b72ed7de2000000" =end @@ -61,4 +61,31 @@ class ExperimentTest < MiniTest::Test end end end + + def test_regression_fingerprints + datasets = [ + "LOAEL_mmol_corrected_smiles.csv" + ] + min_sims = [0.3,0.7] + types = ["FP2","FP3","FP4","MACCS"] + experiment = Experiment.create( + :name => "Fminer vs fingerprint classification for datasets #{datasets}.", + :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, + ) + types.each do |type| + min_sims.each do |min_sim| + experiment.model_settings << { + :algorithm => "OpenTox::Model::LazarRegression", + :neighbor_algorithm => "fingerprint_neighbors", + :neighbor_algorithm_parameter => { + :type => type, + :min_sim => min_sim, + } + } + end + end + experiment.run + p experiment.report + + end end -- cgit v1.2.3 From 33989261450bba279b4e002e5e4ea0475d742abb Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 18 Sep 2015 13:01:59 +0200 Subject: fix for empty values --- lib/dataset.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/dataset.rb b/lib/dataset.rb index 7d889f8..00e2bc3 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -203,7 +203,7 @@ module OpenTox feature = NominalFeature.find_or_create_by(metadata) end end - feature_ids << feature.id + feature_ids << feature.id if feature end $logger.debug "Feature values: #{Time.now-time}" @@ -245,7 +245,7 @@ module OpenTox end compound_ids << compound.id - @data_entries << Array.new(table.first.size-1) + @data_entries << Array.new(table.first.size-1) if (table.first.size-1) > 0 vals.each_with_index do |v,j| if v.blank? -- cgit v1.2.3