From b7cd3ebbb858a8891c35c45896f1bdd525f3534e Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 10 Aug 2015 13:26:06 +0200 Subject: algorithm libraries added, fminer tests pass --- lazar.gemspec | 29 +++++++ lib/algorithm.rb | 21 +++++ lib/bbrc.rb | 159 ++++++++++++++++++++++++++++++++++++++ lib/classification.rb | 107 ++++++++++++++++++++++++++ lib/compound.rb | 2 - lib/crossvalidation.rb | 187 +++++++++++++++++++++++++++++++++++++++++++++ lib/dataset.rb | 4 +- lib/descriptor.rb | 73 +++++++++--------- lib/feature.rb | 3 + lib/lazar.rb | 31 +++++++- lib/neighbor.rb | 25 ++++++ lib/regression.rb | 199 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/similarity.rb | 58 ++++++++++++++ lib/validation.rb | 114 +++++++++++++++++++++++++++ test/descriptor-long.rb | 13 ++++ test/descriptor.rb | 83 ++++++++++++++++++++ test/fminer-long.rb | 37 +++++++++ test/fminer.rb | 46 +++++++++++ 18 files changed, 1146 insertions(+), 45 deletions(-) create mode 100644 lazar.gemspec create mode 100644 lib/algorithm.rb create mode 100644 lib/bbrc.rb create mode 100644 lib/classification.rb create mode 100644 lib/crossvalidation.rb create mode 100644 lib/neighbor.rb create mode 100644 lib/regression.rb create mode 100644 lib/similarity.rb create mode 100644 lib/validation.rb create mode 100644 test/descriptor-long.rb create mode 100644 test/descriptor.rb create mode 100644 test/fminer-long.rb create mode 100644 test/fminer.rb diff --git a/lazar.gemspec b/lazar.gemspec new file mode 100644 index 0000000..3a9a1af --- /dev/null +++ b/lazar.gemspec @@ -0,0 +1,29 @@ +# -*- encoding: utf-8 -*- +$:.push File.expand_path("../lib", __FILE__) + +Gem::Specification.new do |s| + s.name = "opentox-client" + s.version = File.read("./VERSION").strip + s.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler, Denis Gebele"] + s.email = ["helma@in-silico.ch"] + s.homepage = "http://github.com/opentox/lazar" + s.summary = %q{Ruby wrapper for the 
OpenTox REST API} + s.description = %q{Ruby wrapper for the OpenTox REST API (http://www.opentox.org)} + s.license = 'GPL-3' + + s.rubyforge_project = "lazar" + + s.files = `git ls-files`.split("\n") + s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n") + s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) } + s.require_paths = ["lib"] + + # specify any dependencies here; for example: + s.add_runtime_dependency "bundler" + s.add_runtime_dependency "rest-client" + s.add_runtime_dependency 'nokogiri' + s.add_runtime_dependency "openbabel" + s.add_runtime_dependency 'rserve-client' + s.add_runtime_dependency "mongoid", '~> 5.0beta' + +end diff --git a/lib/algorithm.rb b/lib/algorithm.rb new file mode 100644 index 0000000..113f847 --- /dev/null +++ b/lib/algorithm.rb @@ -0,0 +1,21 @@ +module OpenTox + + module Algorithm + + # Generic method to execute algorithms + # Algorithms should: + # - accept a Compound, an Array of Compounds or a Dataset as first argument + # - optional parameters as second argument + # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values + # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object + # @param [Hash] Algorithm parameters + # @return Algorithm result + def self.run algorithm, object, parameters=nil + bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/ + klass,method = algorithm.split('.') + parameters.nil? ? 
Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters) + end + + end +end + diff --git a/lib/bbrc.rb b/lib/bbrc.rb new file mode 100644 index 0000000..6a2eed7 --- /dev/null +++ b/lib/bbrc.rb @@ -0,0 +1,159 @@ +module OpenTox + module Algorithm + class Fminer + TABLE_OF_ELEMENTS = [ +"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] + + # + # Run bbrc algorithm on dataset + # + # @param [OpenTox::Dataset] training dataset + # @param [optional] parameters BBRC parameters, accepted parameters are + # - min_frequency Minimum frequency (default 5) + # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") + # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. 
(default "true") + # - min_chisq_significance Significance threshold (between 0 and 1) + # - nr_hits Set to "true" to get hit count instead of presence + # - get_target Set to "true" to obtain target variable as feature + # @return [OpenTox::Dataset] Fminer Dataset + def self.bbrc training_dataset, params={} + + time = Time.now + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + + prediction_feature = training_dataset.features.first + if params[:min_frequency] + minfreq = params[:min_frequency] + else + per_mil = 5 # value from latest version + i = training_dataset.feature_ids.index prediction_feature.id + nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size + minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST + minfreq = 2 unless minfreq > 2 + minfreq = minfreq.round + end + + @bbrc ||= Bbrc::Bbrc.new + @bbrc.Reset + if prediction_feature.numeric + @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! + else + bad_request_error "No accept values for "\ + "dataset '#{training_dataset.id}' and "\ + "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values + value2act = Hash[[*prediction_feature.accept_values.map.with_index]] + end + @bbrc.SetMinfreq(minfreq) + @bbrc.SetType(1) if params[:feature_type] == "paths" + @bbrc.SetBackbone(false) if params[:backbone] == "false" + @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] + @bbrc.SetConsoleOut(false) + + params[:nr_hits] ? 
nr_hits = params[:nr_hits] : nr_hits = false + feature_dataset = FminerDataset.new( + :training_dataset_id => training_dataset.id, + :training_algorithm => "#{self.to_s}.bbrc", + :training_feature_id => prediction_feature.id , + :training_parameters => { + :min_frequency => minfreq, + :nr_hits => nr_hits, + :backbone => (params[:backbone] == false ? false : true) + } + + ) + feature_dataset.compounds = training_dataset.compounds + + # add data + training_dataset.compounds.each_with_index do |compound,i| + @bbrc.AddCompound(compound.smiles,i+1) + act = value2act[training_dataset.data_entries[i].first] + @bbrc.AddActivity(act,i+1) + end + #g_median=@fminer.all_activities.values.to_scale.median + + #task.progress 10 + #step_width = 80 / @bbrc.GetNoRootNodes().to_f + + $logger.debug "BBRC setup: #{Time.now-time}" + time = Time.now + ftime = 0 + itime = 0 + rtime = 0 + + # run @bbrc + (0 .. @bbrc.GetNoRootNodes()-1).each do |j| + results = @bbrc.MineRoot(j) + results.each do |result| + rt = Time.now + f = YAML.load(result)[0] + smarts = f.shift + # convert fminer SMARTS representation into a more human readable format + smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do + element = TABLE_OF_ELEMENTS[$1.to_i-1] + $2 == "a" ? element.downcase : element + end + p_value = f.shift + f.flatten! 
+ +=begin + if (!@bbrc.GetRegression) + id_arrs = f[2..-1].flatten + max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc + effect = max+1 + else #regression part + id_arrs = f[2] + # DV: effect calculation + f_arr=Array.new + f[2].each do |id| + id=id.keys[0] # extract id from hit count hash + f_arr.push(@fminer.all_activities[id]) + end + f_median=f_arr.to_scale.median + if g_median >= f_median + effect = 'activating' + else + effect = 'deactivating' + end + end +=end + rtime += Time.now - rt + + ft = Time.now + feature = OpenTox::FminerSmarts.find_or_create_by({ + "smarts" => smarts, + "p_value" => p_value.to_f.abs.round(5), + #"effect" => effect, + "dataset_id" => feature_dataset.id + }) + feature_dataset.feature_ids << feature.id + ftime += Time.now - ft + + it = Time.now + f.each do |id_count_hash| + id_count_hash.each do |id,count| + nr_hits ? count = count.to_i : count = 1 + feature_dataset.data_entries[id-1] ||= [] + feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count + end + end + itime += Time.now - it + + end + end + + $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})" + time = Time.now + + feature_dataset.fill_nil_with 0 + + $logger.debug "Prepare save: #{Time.now-time}" + time = Time.now + feature_dataset.save_all + + $logger.debug "Save: #{Time.now-time}" + feature_dataset + + end + end + end +end diff --git a/lib/classification.rb b/lib/classification.rb new file mode 100644 index 0000000..fc6fa77 --- /dev/null +++ b/lib/classification.rb @@ -0,0 +1,107 @@ +module OpenTox + module Algorithm + + class Classification + + def self.weighted_majority_vote neighbors + return [nil,nil] if neighbors.empty? 
+ weighted_sum = {} + sim_sum = 0.0 + neighbors.each do |row| + n,sim,acts = row + acts.each do |act| + weighted_sum[act] ||= 0 + weighted_sum[act] += sim + end + end + case weighted_sum.size + when 1 + return [weighted_sum.keys.first, 1.0] + when 2 + sim_sum = weighted_sum[weighted_sum.keys[0]] + sim_sum -= weighted_sum[weighted_sum.keys[1]] + sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1] + confidence = (sim_sum/neighbors.size).abs + return [prediction,confidence] + else + bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'" + end + end + + # Classification with majority vote from neighbors weighted by similarity + # @param [Hash] params Keys `:activities, :sims, :value_map` are required + # @return [Numeric] A prediction value. + def self.fminer_weighted_majority_vote neighbors, training_dataset + + neighbor_contribution = 0.0 + confidence_sum = 0.0 + + $logger.debug "Weighted Majority Vote Classification." 
+ + values = neighbors.collect{|n| n[2]}.uniq + neighbors.each do |neighbor| + i = training_dataset.compound_ids.index n.id + neighbor_weight = neighbor[1] + activity = values.index(neighbor[2]) + 1 # map values to integers > 1 + neighbor_contribution += activity * neighbor_weight + if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true + case activity + when 1 + confidence_sum -= neighbor_weight + when 2 + confidence_sum += neighbor_weight + end + else + confidence_sum += neighbor_weight + end + end + if values.size == 2 + if confidence_sum >= 0.0 + prediction = values[1] + elsif confidence_sum < 0.0 + prediction = values[0] + end + elsif values.size == 1 # all neighbors have the same value + prediction = values[0] + else + prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction + end + + confidence = (confidence_sum/neighbors.size).abs + {:value => prediction, :confidence => confidence.abs} + end + + # Local support vector regression from neighbors + # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required + # @return [Numeric] A prediction value. + def self.local_svm_classification(params) + + confidence = 0.0 + prediction = nil + + $logger.debug "Local SVM." + if params[:activities].size>0 + if params[:props] + n_prop = params[:props][0].collect.to_a + q_prop = params[:props][1].collect.to_a + props = [ n_prop, q_prop ] + end + activities = params[:activities].collect.to_a + activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification + prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = prediction.sub(/Val/,"") if prediction # Convert back + confidence = 0.0 if prediction.nil? + #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." 
+ confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) + end + {:prediction => prediction, :confidence => confidence} + + end + + + + end + + end +end + diff --git a/lib/compound.rb b/lib/compound.rb index 3ba1670..3418fcc 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -3,8 +3,6 @@ # Could not find contribution data file. CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" -require 'openbabel' -require "base64" module OpenTox diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb new file mode 100644 index 0000000..d926cc4 --- /dev/null +++ b/lib/crossvalidation.rb @@ -0,0 +1,187 @@ +module OpenTox + + class CrossValidation + field :validation_ids, type: Array, default: [] + field :folds, type: Integer + field :nr_instances, type: Integer + field :nr_unpredicted, type: Integer + field :predictions, type: Array + field :finished_at, type: Time + end + + class ClassificationCrossValidation < CrossValidation + + field :accept_values, type: Array + field :confusion_matrix, type: Array + field :weighted_confusion_matrix, type: Array + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash + field :predictivity, type: Hash + # TODO auc, f-measure (usability??) 
+ + def self.create model, n=10 + cv = self.new + validation_ids = [] + nr_instances = 0 + nr_unpredicted = 0 + predictions = [] + validation_class = Object.const_get(self.to_s.sub(/Cross/,'')) + accept_values = Feature.find(model.prediction_feature_id).accept_values + confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} + weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} + true_rate = {} + predictivity = {} + fold_nr = 1 + training_dataset = Dataset.find model.training_dataset_id + training_dataset.folds(n).each do |fold| + t = Time.now + $logger.debug "Fold #{fold_nr}" + validation = validation_class.create(model, fold[0], fold[1]) + validation_ids << validation.id + nr_instances += validation.nr_instances + nr_unpredicted += validation.nr_unpredicted + predictions += validation.predictions + validation.confusion_matrix.each_with_index do |r,i| + r.each_with_index do |c,j| + confusion_matrix[i][j] += c + weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j] + end + end + $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds" + fold_nr +=1 + end + true_rate = {} + predictivity = {} + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + cv.update_attributes( + nr_instances: nr_instances, + nr_unpredicted: nr_unpredicted, + accept_values: accept_values, + confusion_matrix: confusion_matrix, + weighted_confusion_matrix: weighted_confusion_matrix, + accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, + weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, + true_rate: true_rate, + predictivity: predictivity, + 
predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence + finished_at: Time.now + ) + cv.save + cv + end + + #Average area under roc 0.646 + #Area under roc 0.646 + #F measure carcinogen: 0.769, noncarcinogen: 0.348 + end + + class RegressionCrossValidation < Validation + + field :validation_ids, type: Array, default: [] + field :folds, type: Integer + field :rmse, type: Float + field :mae, type: Float + field :weighted_rmse, type: Float + field :weighted_mae, type: Float + + def self.create model, n=10 + cv = self.new + validation_ids = [] + nr_instances = 0 + nr_unpredicted = 0 + predictions = [] + validation_class = Object.const_get(self.to_s.sub(/Cross/,'')) + fold_nr = 1 + training_dataset = Dataset.find model.training_dataset_id + training_dataset.folds(n).each do |fold| + t = Time.now + $logger.debug "Predicting fold #{fold_nr}" + + validation = validation_class.create(model, fold[0], fold[1]) + validation_ids << validation.id + nr_instances += validation.nr_instances + nr_unpredicted += validation.nr_unpredicted + predictions += validation.predictions + $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds" + fold_nr +=1 + end + rmse = 0 + weighted_rmse = 0 + rse = 0 + weighted_rse = 0 + mae = 0 + weighted_mae = 0 + rae = 0 + weighted_rae = 0 + n = 0 + confidence_sum = 0 + predictions.each do |pred| + compound_id,activity,prediction,confidence = pred + if activity and prediction + error = prediction-activity + rmse += error**2 + weighted_rmse += confidence*error**2 + mae += error.abs + weighted_mae += confidence*error.abs + n += 1 + confidence_sum += confidence + else + # TODO: create warnings + p pred + end + end + mae = mae/n + weighted_mae = weighted_mae/confidence_sum + rmse = Math.sqrt(rmse/n) + weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) + cv.update_attributes( + folds: n, + validation_ids: validation_ids, + nr_instances: nr_instances, + nr_unpredicted: nr_unpredicted, + predictions: predictions.sort{|a,b| b[3] 
<=> a[3]}, + mae: mae, + rmse: rmse, + weighted_mae: weighted_mae, + weighted_rmse: weighted_rmse + ) + cv.save + cv + end + + def plot + # RMSE + x = predictions.collect{|p| p[1]} + y = predictions.collect{|p| p[2]} + R.assign "Measurement", x + R.assign "Prediction", y + R.eval "par(pty='s')" # sets the plot type to be square + #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))" + #R.eval "error <- log(Measurement)-log(Prediction)" + R.eval "error <- Measurement-Prediction" + R.eval "rmse <- sqrt(mean(error^2,na.rm=T))" + R.eval "mae <- mean( abs(error), na.rm = TRUE)" + R.eval "r <- cor(log(Prediction),log(Measurement))" + R.eval "svg(filename='/tmp/#{id.to_s}.svg')" + R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)" + #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)" + #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)" + R.eval "abline(0,1,col='blue')" + #R.eval "abline(fitline,col='red')" + R.eval "dev.off()" + "/tmp/#{id.to_s}.svg" + end + end + + +end diff --git a/lib/dataset.rb b/lib/dataset.rb index 0237adf..4f6f0b5 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -294,8 +294,8 @@ module OpenTox end =end - private - + # Fill unset data entries + # @param any value def fill_nil_with n (0 .. 
compound_ids.size-1).each do |i| @data_entries[i] ||= [] diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 68bc7a2..335f3dc 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -1,11 +1,12 @@ require 'digest/md5' ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" -BABEL_3D_CACHE_DIR = File.join(File.dirname(__FILE__),"..",'/babel_3d_cache') # TODO store descriptors in mongodb module OpenTox module Algorithm + + # Class for descriptor calculations class Descriptor include OpenTox @@ -39,6 +40,7 @@ module OpenTox require_relative "unique_descriptors.rb" + # Description of available descriptors def self.description descriptor lib = descriptor.split('.').first case lib @@ -54,6 +56,7 @@ module OpenTox end end + # Match an array of smarts features def self.smarts_match compounds, smarts_features, count=false bad_request_error "Compounds for smarts_match are empty" unless compounds bad_request_error "Smarts features for smarts_match are empty" unless smarts_features @@ -73,7 +76,7 @@ module OpenTox # eg. at line 249 of rat_feature_dataset # which worked with opentox-client # (but no smarts_match) - p "'#{compound.inchi}'" + #p "'#{compound.inchi}'" obconversion.read_string(obmol,compound.inchi) @smarts.each_with_index do |smart,s| smarts_pattern.init(smart) @@ -88,49 +91,20 @@ module OpenTox serialize end + # Count matches of an array with smarts features def self.smarts_count compounds, smarts + # TODO: non-overlapping matches? 
smarts_match compounds,smarts,true end - def self.serialize - case @input_class - when "OpenTox::Compound" - if @with_names and @physchem_descriptors - [@physchem_descriptors,@data_entries.first] - else - @data_entries.first - end - when "Array" - if @with_names and @physchem_descriptors - [@physchem_descriptors,@data_entries.first] - else - @data_entries - end - when "OpenTox::Dataset" - dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) - if @smarts - dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} - @count ? algo = "count" : algo = "match" - dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" - - elsif @physchem_descriptors - dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} - dataset.data_entries = @data_entries - dataset.feature_calculation_algorithm = "#{self}.physchem" - #TODO params? - end - dataset.save_all - dataset - end - end - - def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS, with_names=false + # Calculate physchem descriptors + # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset + def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS parse compounds @data_entries = Array.new(@compounds.size){[]} @descriptors = descriptors @smarts = nil @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features - @with_names = with_names des = {} @descriptors.each do |d| lib, descriptor = d.split(".",2) @@ -173,7 +147,8 @@ module OpenTox end last_feature_idx = @physchem_descriptors.size YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| - $logger.error "Descriptor calculation failed for compound #{compounds[i].inchi}." if calculation.empty? 
+ # TODO create warnings + #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty? # CDK Descriptors may calculate multiple values, they are stored in separate features @physchem_descriptors += calculation.keys if i == 0 calculation.keys.each_with_index do |name,j| @@ -238,6 +213,30 @@ module OpenTox end end + def self.serialize + case @input_class + when "OpenTox::Compound" + @data_entries.first + when "Array" + @data_entries + when "OpenTox::Dataset" + dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) + if @smarts + dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} + @count ? algo = "count" : algo = "match" + dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" + + elsif @physchem_descriptors + dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} + dataset.data_entries = @data_entries + dataset.feature_calculation_algorithm = "#{self}.physchem" + #TODO params? + end + dataset.save_all + dataset + end + end + def self.fix_value val val = val.first if val.is_a? 
Array and val.size == 1 val = nil if val == "NaN" diff --git a/lib/feature.rb b/lib/feature.rb index 9deb199..b2bc1f5 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -29,6 +29,9 @@ module OpenTox # Feature for SMARTS fragments class Smarts < NominalFeature field :smarts, type: String + def self.from_smarts smarts + self.find_or_create_by :smarts => smarts + end end # Feature for supervised fragments from Fminer algorithm diff --git a/lib/lazar.rb b/lib/lazar.rb index 8831ba2..2e7e7c2 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -6,6 +6,9 @@ require 'json' require 'logger' require 'mongoid' require 'rserve' +require "nokogiri" +require "base64" + # Mongo setup # TODO retrieve correct environment from Rack/Sinatra @@ -27,8 +30,21 @@ Mongo::Logger.logger = $logger Mongo::Logger.level = Logger::WARN #Mongoid.logger = $logger +# Require sub-Repositories +require_relative '../libfminer/libbbrc/bbrc' # include before openbabel +require_relative '../libfminer/liblast/last' # +require_relative '../last-utils/lu.rb' +require 'openbabel' + +# Fminer environment variables +ENV['FMINER_SMARTS'] = 'true' +ENV['FMINER_NO_AROMATIC'] = 'true' +ENV['FMINER_PVALUES'] = 'true' +ENV['FMINER_SILENT'] = 'true' +ENV['FMINER_NR_HITS'] = 'true' + # OpenTox classes and includes -CLASSES = ["Feature","Compound", "Dataset", "Validation", "CrossValidation"]# Algorithm and Models are modules +CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", @@ -39,8 +55,15 @@ CLASSES = ["Feature","Compound", "Dataset", "Validation", "CrossValidation"]# A "compound.rb", "dataset.rb", "descriptor.rb", - #"algorithm.rb", - #"model.rb", - #"validation.rb" + "algorithm.rb", + "descriptor.rb", + "bbrc.rb", + "lazar.rb", + "similarity.rb", + "neighbor.rb", + "classification.rb", + "regression.rb", + "validation.rb", + "crossvalidation.rb", ].each{ |f| 
require_relative f } diff --git a/lib/neighbor.rb b/lib/neighbor.rb new file mode 100644 index 0000000..a2c28d4 --- /dev/null +++ b/lib/neighbor.rb @@ -0,0 +1,25 @@ +module OpenTox + module Algorithm + class Neighbor + + def self.fingerprint_similarity compound, params={} + compound.neighbors params[:min_sim] + end + + def self.fminer_similarity compound, params + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features.collect{|f| f.smarts} ) + neighbors = [] + + # find neighbors + feature_dataset.data_entries.each_with_index do |fingerprint, i| + sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint + if sim > params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + end + end +end diff --git a/lib/regression.rb b/lib/regression.rb new file mode 100644 index 0000000..891d7f9 --- /dev/null +++ b/lib/regression.rb @@ -0,0 +1,199 @@ +# TODO install R packages kernlab, caret, doMC, class, e1071 + + + # log transform activities (create new dataset) + # scale, normalize features, might not be necessary + # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is + # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression + # zero-order correlation and the semi-partial correlation + # seems to be necessary for svm + # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1 + # http://stackoverflow.com/questions/15436367/svm-scaling-input-values + # use lasso or elastic net?? 
+ # select relevant features + # remove features with a single value + # remove correlated features + # remove features not correlated with endpoint +module OpenTox + module Algorithm + + class Regression + + def self.weighted_average neighbors + weighted_sum = 0.0 + sim_sum = 0.0 + neighbors.each do |row| + n,sim,acts = row + acts.each do |act| + weighted_sum += sim*Math.log10(act) + sim_sum += sim + end + end + confidence = sim_sum/neighbors.size.to_f + sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) + [prediction,confidence] + end + + # Local support vector regression from neighbors + # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required + # @return [Numeric] A prediction value. + def self.local_svm_regression neighbors, params={:min_train_performance => 0.1} + + confidence = 0.0 + prediction = nil + + $logger.debug "Local SVM." + props = neighbors.collect{|row| row[3] } + neighbors.shift + activities = neighbors.collect{|n| n[2]} + prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = nil if (!prediction.nil? && prediction.infinite?) + $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')." + if prediction + confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities}) + else + confidence = nil if prediction.nil? + end + [prediction, confidence] + + end + + + # Local support vector prediction from neighbors. + # Uses propositionalized setting. + # Not to be called directly (use local_svm_regression or local_svm_classification). + # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] + # @param [Array] activities, activities for neighbors. + # @param [Float] min_train_performance, parameter to control censoring + # @return [Numeric] A prediction value. 
+      # Local SVM prediction from a property matrix (propositionalization).
+      # Assembles an R script (caret train, method svmRadial) in rs, writes it to /tmp/r.r and runs it with Rscript.
+      # @param [Array<Array>] props props[0] is the row to predict (q_prop), props[1..-1] are the training rows (n_prop)
+      # @param [Array] activities training activities (presumably one per training row - confirm with callers)
+      # @param [Numeric] min_train_performance predictions with a lower training performance are censored (nil)
+      # @return the common activity if all activities are identical, otherwise the SVM prediction or nil
+      # FIXME: train_success is not assigned anywhere in this method (unless a method of that name exists elsewhere)
+      # and the model is fitted by Rscript, not in the Rserve session; the SVM branch presumably always returns nil
+      def self.local_svm_prop(props, activities, min_train_performance)
+        $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+        n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays.
+        q_prop = props[0] # is an Array.
+
+        prediction = nil
+        if activities.uniq.size == 1
+          prediction = activities[0]
+        else
+          t = Time.now
+          #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
+          @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests
+          rs = []
+          ["caret", "doMC", "class"].each do |lib|
+            #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))"
+            rs << "suppressPackageStartupMessages(library('#{lib}'))"
+          end
+          #@r.eval "registerDoMC()" # switch on parallel processing
+          rs << "registerDoMC()" # switch on parallel processing
+          #@r.eval "set.seed(1)"
+          rs << "set.seed(1)"
+          $logger.debug "Loading R packages: #{Time.now-t}"
+          t = Time.now
+          $logger.debug "n_prop: #{n_prop.inspect}"
+          begin
+
+            # set data
+            rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
+            rs << "n_prop_x_size <- c(#{n_prop.size})"
+            rs << "n_prop_y_size <- c(#{n_prop[0].size})"
+            rs << "y <- c(#{activities.join(',')})"
+            rs << "q_prop <- c(#{q_prop.join(',')})"
+            rs << "y = matrix(y)"
+            rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
+            rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
+
+            $logger.debug "Setting R data: #{Time.now-t}"
+            t = Time.now
+            # prepare data
+            rs << "
+              weights=NULL
+              if (!(class(y) == 'numeric')) {
+                y = factor(y)
+                weights=unlist(as.list(prop.table(table(y))))
+                weights=(weights-1)^2
+              }
+            "
+
+            rs << "
+              rem = nearZeroVar(prop_matrix)
+              if (length(rem) > 0) {
+                prop_matrix = prop_matrix[,-rem,drop=F]
+                q_prop = q_prop[,-rem,drop=F]
+              }
+              rem = findCorrelation(cor(prop_matrix))
+              if (length(rem) > 0) {
+                prop_matrix = prop_matrix[,-rem,drop=F]
+                q_prop = q_prop[,-rem,drop=F]
+              }
+            "
+
+            $logger.debug "Preparing R data: #{Time.now-t}"
+            t = Time.now
+            # model + support vectors
+            #train_success = @r.eval <<-EOR
+            rs << '
+              model = train(prop_matrix,y,
+                method="svmRadial",
+                preProcess=c("center", "scale"),
+                class.weights=weights,
+                trControl=trainControl(method="LGOCV",number=10),
+                tuneLength=8
+              )
+              perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+            '
+            File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")}
+            $logger.debug rs.join("\n")
+            $logger.debug `Rscript /tmp/r.r`
+=begin
+            @r.void_eval <<-EOR
+              model = train(prop_matrix,y,
+                method="svmRadial",
+                #preProcess=c("center", "scale"),
+                #class.weights=weights,
+                #trControl=trainControl(method="LGOCV",number=10),
+                #tuneLength=8
+              )
+              perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+            EOR
+=end
+
+            $logger.debug "Creating R SVM model: #{Time.now-t}"
+            t = Time.now
+            if train_success
+              # prediction
+              @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
+              @r.eval "if (class(y)!='numeric') p = as.character(p)"
+              prediction = @r.p
+
+              # censoring
+              prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f )
+              prediction = nil if prediction =~ /NA/
+              $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'"
+            else
+              $logger.debug "Model creation failed."
+              prediction = nil
+            end
+            $logger.debug "R Prediction: #{Time.now-t}"
+          rescue StandardError => e
+            $logger.debug "#{e.class}: #{e.message}"
+            $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+          ensure
+            #TODO: broken pipe
+            #@r.quit # free R
+          end
+        end
+        prediction
+      end
+    end
+
+  end
+end
+
diff --git a/lib/similarity.rb b/lib/similarity.rb
new file mode 100644
index 0000000..91e18db
--- /dev/null
+++ b/lib/similarity.rb
@@ -0,0 +1,58 @@
+=begin
+* Name: similarity.rb
+* Description: Similarity algorithms
+* Author: Andreas Maunz 0 and b.size>0
+          if a.size>12 && b.size>12
+            a = a[0..11]
+            b = b[0..11]
+          end
+          a_vec = a.to_gv
+          b_vec = b.to_gv
+          val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
+        end
+        val
+      end
+
+    end
+
+  end
+end
diff --git a/lib/validation.rb b/lib/validation.rb
new file mode 100644
index 0000000..bcbe49a
--- /dev/null
+++ b/lib/validation.rb
@@ -0,0 +1,128 @@
+module OpenTox
+
+  # Outcome of predicting a test dataset with a model that was built for validation.
+  class Validation
+
+    field :prediction_dataset_id, type: BSON::ObjectId
+    field :test_dataset_id, type: BSON::ObjectId
+    field :nr_instances, type: Integer
+    field :nr_unpredicted, type: Integer
+    field :predictions, type: Array
+
+    # @return the Dataset found by prediction_dataset_id
+    def prediction_dataset
+      Dataset.find prediction_dataset_id
+    end
+
+    # @return the Dataset found by test_dataset_id
+    def test_dataset
+      Dataset.find test_dataset_id
+    end
+
+  end
+
+  class ClassificationValidation < Validation
+    # NOTE(review): accept_values is sized and indexed like an Array in create - confirm that String is the intended type
+    field :accept_values, type: String
+    field :confusion_matrix, type: Array
+    field :weighted_confusion_matrix, type: Array
+
+    # Validate by building a fresh model from training_set and predicting the compounds of test_set.
+    # Confusion matrices are indexed [predicted][measured]; only accept_values[0] and accept_values[1] are tallied.
+    # @param model only its class is used (model.class.create training_set)
+    # @param training_set dataset used to create the validation model
+    # @param test_set dataset whose compound_ids are predicted; data_entries[i].first is taken as measured activity
+    # @return [ClassificationValidation] the saved validation
+    def self.create model, training_set, test_set
+      #feature_dataset = Dataset.find model.feature_dataset_id
+      # TODO check and delegate to Algorithm
+      #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
+      validation_model = model.class.create training_set#, features
+      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+      prediction_dataset = validation_model.predict test_set_without_activities
+      accept_values = prediction_dataset.prediction_feature.accept_values
+      confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+      weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+      predictions = []
+      nr_unpredicted = 0
+      prediction_dataset.data_entries.each_with_index do |pe,i|
+        if pe[0] and pe[1] and pe[1].numeric?
+          prediction = pe[0]
+          # TODO prediction_feature, convention??
+          # TODO generalize for multiple classes
+          activity = test_set.data_entries[i].first
+          confidence = pe[1]
+          predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
+          if prediction == activity
+            if prediction == accept_values[0]
+              confusion_matrix[0][0] += 1
+              weighted_confusion_matrix[0][0] += confidence
+            elsif prediction == accept_values[1]
+              confusion_matrix[1][1] += 1
+              weighted_confusion_matrix[1][1] += confidence
+            end
+          else
+            if prediction == accept_values[0]
+              confusion_matrix[0][1] += 1
+              weighted_confusion_matrix[0][1] += confidence
+            elsif prediction == accept_values[1]
+              confusion_matrix[1][0] += 1
+              weighted_confusion_matrix[1][0] += confidence
+            end
+          end
+        else
+          nr_unpredicted += 1 if pe[0].nil?
+        end
+      end
+      validation = self.new(
+        :prediction_dataset_id => prediction_dataset.id,
+        :test_dataset_id => test_set.id,
+        :nr_instances => test_set.compound_ids.size,
+        :nr_unpredicted => nr_unpredicted,
+        :accept_values => accept_values,
+        :confusion_matrix => confusion_matrix,
+        :weighted_confusion_matrix => weighted_confusion_matrix,
+        :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+      )
+      validation.save
+      validation
+    end
+  end
+
+  class RegressionValidation < Validation
+    # Validate by building a Model::LazarRegression from training_set and predicting the compounds of test_set.
+    # @param model not used here, a Model::LazarRegression is always created
+    # @param training_set dataset used to create the validation model
+    # @param test_set dataset whose compound_ids are predicted; the first data_entries column is taken as measured activity
+    # @return [RegressionValidation] the saved validation
+    def self.create model, training_set, test_set
+
+      validation_model = Model::LazarRegression.create training_set
+      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+      prediction_dataset = validation_model.predict test_set_without_activities
+      predictions = []
+      nr_unpredicted = 0
+      activities = test_set.data_entries.collect{|de| de.first}
+      prediction_dataset.data_entries.each_with_index do |de,i|
+        if de[0] and de[1] and de[1].numeric?
+          activity = activities[i]
+          prediction = de.first
+          confidence = de[1]
+          predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
+        else
+          nr_unpredicted += 1
+        end
+      end
+      validation = self.new(
+        :prediction_dataset_id => prediction_dataset.id,
+        :test_dataset_id => test_set.id,
+        :nr_instances => test_set.compound_ids.size,
+        :nr_unpredicted => nr_unpredicted,
+        :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+      )
+      validation.save
+      validation
+    end
+  end
+
+end
diff --git a/test/descriptor-long.rb b/test/descriptor-long.rb
new file mode 100644
index 0000000..2752d5a
--- /dev/null
+++ b/test/descriptor-long.rb
@@ -0,0 +1,13 @@
+require_relative "setup.rb"
+class DescriptorLongTest < MiniTest::Test
+
+  def test_dataset_all
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
+    d = OpenTox::Algorithm::Descriptor.physchem dataset
+    assert_equal dataset.compounds, d.compounds
+    assert_equal 332, d.features.size
+    assert_equal 332, d.data_entries.first.size
+    d.delete
+  end
+
+end
diff --git a/test/descriptor.rb b/test/descriptor.rb
new file mode 100644
index 0000000..1143b87
--- /dev/null
+++ b/test/descriptor.rb
@@ -0,0 +1,83 @@
+require_relative "setup.rb"
+
+class DescriptorTest < MiniTest::Test
+
+  def test_list
+    # check available descriptors
+    @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys
+    assert_equal 111,@descriptors.size,"wrong num physchem descriptors"
+    @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES
+    assert_equal 356,@descriptor_values.size,"wrong num physchem descriptors"
+    sum = 0
+    [ @descriptors, @descriptor_values ].each do |desc|
+      {"Openbabel"=>16,"Cdk"=>(desc==@descriptors ? 50 : 295),"Joelib"=>45}.each do |k,v|
+        assert_equal v,desc.select{|x| x=~/^#{k}\./}.size,"wrong num #{k} descriptors"
+        sum += v
+      end
+    end
+    assert_equal (111+356),sum
+  end
+
+  def test_smarts
+    c = OpenTox::Compound.from_smiles "N=C=C1CCC(=F=FO)C1"
+    s = Smarts.find_or_create_by(:smarts => "FF")
+    result = OpenTox::Algorithm::Descriptor.smarts_match c, s
+    assert_equal [1], result
+    smarts = ["CC", "C", "C=C", "CO", "FF", "C1CCCC1", "NN"].collect{|s| Smarts.find_or_create_by(:smarts => s)}
+    result = OpenTox::Algorithm::Descriptor.smarts_match c, smarts
+    assert_equal [1, 1, 1, 0, 1, 1, 0], result
+    smarts_count = [10, 6, 2, 0, 2, 10, 0]
+    result = OpenTox::Algorithm::Descriptor.smarts_count c, smarts
+    assert_equal smarts_count, result
+  end
+
+  def test_compound_openbabel_single
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Openbabel.logP"]
+    assert_equal [1.12518], result
+  end
+
+  def test_compound_cdk_single
+    c = OpenTox::Compound.from_smiles "c1ccccc1"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"]
+    assert_equal [12], result
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"]
+    assert_equal [17], result
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.CarbonTypes"]
+    c_types = {"Cdk.CarbonTypes.C1SP1"=>1, "Cdk.CarbonTypes.C2SP1"=>0, "Cdk.CarbonTypes.C1SP2"=>0, "Cdk.CarbonTypes.C2SP2"=>1, "Cdk.CarbonTypes.C3SP2"=>0, "Cdk.CarbonTypes.C1SP3"=>2, "Cdk.CarbonTypes.C2SP3"=>1, "Cdk.CarbonTypes.C3SP3"=>1, "Cdk.CarbonTypes.C4SP3"=>0}
+    assert_equal c_types.values, result
+  end
+
+  def test_compound_joelib_single
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, ["Joelib.LogP"]
+    assert_equal [2.65908], result
+  end
+
+  def test_compound_all
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c
+    assert_equal 332, result.size
+    assert_equal 30.8723, result[2]
+    assert_equal 1.12518, result[328]
+  end
+
+  def test_compound_descriptor_parameters
+    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
+    result = OpenTox::Algorithm::Descriptor.physchem c, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ], true
+    assert_equal 12, result.last.size
+    assert_equal ["Openbabel.logP", "Cdk.AtomCount.nAtom", "Cdk.CarbonTypes.C1SP1", "Cdk.CarbonTypes.C2SP1", "Cdk.CarbonTypes.C1SP2", "Cdk.CarbonTypes.C2SP2", "Cdk.CarbonTypes.C3SP2", "Cdk.CarbonTypes.C1SP3", "Cdk.CarbonTypes.C2SP3", "Cdk.CarbonTypes.C3SP3", "Cdk.CarbonTypes.C4SP3", "Joelib.LogP"], result.first
+    assert_equal [1.12518, 17, 1, 0, 0, 1, 0, 2, 1, 1, 0, 2.65908], result.last
+  end
+
+  def test_dataset_descriptor_parameters
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv")
+    d = OpenTox::Algorithm::Descriptor.physchem dataset, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ]
+    assert_kind_of Dataset, d
+    assert_equal dataset.compounds, d.compounds
+    assert_equal dataset.compounds.size, d.data_entries.size
+    assert_equal 12, d.data_entries.first.size
+  end
+
+end
diff --git a/test/fminer-long.rb b/test/fminer-long.rb
new file mode 100644
index 0000000..826f206
--- /dev/null
+++ b/test/fminer-long.rb
@@ -0,0 +1,37 @@
+require_relative "setup.rb"
+
+class FminerTest < MiniTest::Test
+
+  def test_fminer_multicell
+    skip "multicell segfaults"
+    # TODO aborts, probably fminer
+    # or OpenBabel segfault
+    dataset = OpenTox::Dataset.new
+    #multi_cell_call.csv
+    dataset.upload File.join(DATA_DIR,"multi_cell_call.csv")
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
+    dataset.delete
+    feature_dataset.delete
+  end
+
+  def test_fminer_isscan
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"ISSCAN-multi.csv")
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset)#, :min_frequency => 15)
+    assert_equal feature_dataset.compounds.size, dataset.compounds.size
+    p feature_dataset
+    dataset.delete
+    feature_dataset.delete
+  end
+
+  def test_fminer_kazius
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv")
+    # TODO reactivate default settings
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc(dataset, :min_frequency => 20)
+    assert_equal feature_dataset.compounds.size, dataset.compounds.size
+    feature_dataset = Dataset.find feature_dataset.id
+    assert_equal dataset.compounds.size, feature_dataset.data_entries.size
+    dataset.delete
+    feature_dataset.delete
+  end
+
+end
diff --git a/test/fminer.rb b/test/fminer.rb
new file mode 100644
index 0000000..17dcbe1
--- /dev/null
+++ b/test/fminer.rb
@@ -0,0 +1,46 @@
+require_relative "setup.rb"
+
+class FminerTest < MiniTest::Test
+
+  def test_fminer_bbrc
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    refute_nil dataset.id
+    feature_dataset = OpenTox::Algorithm::Fminer.bbrc dataset
+    feature_dataset = Dataset.find feature_dataset.id
+    assert_equal dataset.compounds.size, feature_dataset.compounds.size
+    assert_equal 54, feature_dataset.features.size
+    assert_equal "C-C-C=C", feature_dataset.features.first.smarts
+    compounds = feature_dataset.compounds
+    smarts = feature_dataset.features
+    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
+    feature_dataset.data_entries.each_with_index do |fingerprint,i|
+      assert_equal match[i], fingerprint
+    end
+
+    dataset.delete
+    feature_dataset.delete
+  end
+
+  def test_fminer_last
+    skip "last features have to be activated"
+    dataset = OpenTox::Dataset.new
+    dataset.upload File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    feature_dataset = OpenTox::Algorithm::Fminer.last :dataset => dataset
+    assert_equal dataset.compounds.size, feature_dataset.compounds.size
+    assert_equal 21, feature_dataset.features.size
+    assert_equal '[#6&A]-[#6&a]:[#6&a]:[#6&a]:[#6&a]:[#6&a]', feature_dataset.features.first.smarts
+
+    compounds = feature_dataset.compounds
+    smarts = feature_dataset.features.collect{|f| f.smarts}
+    match = OpenTox::Algorithm::Descriptor.smarts_match compounds, smarts
+    compounds.each_with_index do |c,i|
+      smarts.each_with_index do |s,j|
+        assert_equal match[i][j], feature_dataset.data_entries[i][j].to_i
+      end
+    end
+
+    dataset.delete
+    feature_dataset.delete
+  end
+
+end
-- 
cgit v1.2.3