From 0c5d2e678908a2d4aea43efbedbedc2c0439be30 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 14 Mar 2016 15:25:50 +0100 Subject: descriptor tests --- ext/lazar/extconf.rb | 36 +------ lib/bbrc.rb | 165 ----------------------------- lib/classification.rb | 1 - lib/compound.rb | 67 +++++------- lib/crossvalidation.rb | 1 - lib/dataset.rb | 2 - lib/descriptor.rb | 252 --------------------------------------------- lib/feature.rb | 9 -- lib/lazar.rb | 8 -- lib/model.rb | 3 - lib/overwrite.rb | 6 +- lib/physchem.rb | 4 + lib/regression.rb | 3 +- lib/rest-client-wrapper.rb | 1 - lib/similarity.rb | 58 ----------- lib/validation.rb | 10 -- test/compound.rb | 3 +- test/dataset.rb | 2 +- test/descriptor.rb | 68 +++++------- 19 files changed, 61 insertions(+), 638 deletions(-) delete mode 100644 lib/bbrc.rb delete mode 100644 lib/descriptor.rb delete mode 100644 lib/similarity.rb diff --git a/ext/lazar/extconf.rb b/ext/lazar/extconf.rb index edb960a..a76f0f4 100644 --- a/ext/lazar/extconf.rb +++ b/ext/lazar/extconf.rb @@ -5,11 +5,10 @@ main_dir = File.expand_path(File.join(File.dirname(__FILE__),"..","..")) # install OpenBabel - openbabel_version = "2.3.2" openbabel_dir = File.join main_dir, "openbabel" -src_dir = openbabel_dir #File.join openbabel_dir, "openbabel-#{openbabel_version}" +src_dir = openbabel_dir build_dir = File.join src_dir, "build" install_dir = openbabel_dir install_lib_dir = File.join install_dir, "lib" @@ -52,37 +51,4 @@ end ob_include= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/include/openbabel-2.0") ob_lib= File.expand_path File.join(File.dirname(__FILE__),"../../openbabel/lib") -# compile ruby bindings -=begin -puts "Compiling and installing OpenBabel Ruby bindings." -Dir.chdir ruby_src_dir do - # fix rpath - system "sed -i 's|with_ldflags.*$|with_ldflags(\"#\$LDFLAGS -dynamic -Wl,-rpath,#{install_lib_dir}\") do|' #{File.join(ruby_src_dir,'extconf.rb')}" - system "#{RbConfig.ruby} extconf.rb --with-openbabel-include=#{ob_include} --with-openbabel-lib=#{ob_lib}" - system "make -j#{nr_processors}" -end -=end - -# install fminer -fminer_dir = File.join main_dir, "libfminer" -system "git clone git://github.com/amaunz/fminer2.git #{fminer_dir}" - -["libbbrc","liblast"].each do |lib| - FileUtils.cd File.join(fminer_dir,lib) - system "sed -i 's,^INCLUDE_OB.*,INCLUDE_OB\ =\ #{ob_include},g' Makefile" - system "sed -i 's,^LDFLAGS_OB.*,LDFLAGS_OB\ =\ #{ob_lib},g' Makefile" - system "sed -i 's,^INCLUDE_RB.*,INCLUDE_RB\ =\ #{RbConfig::CONFIG['rubyhdrdir']},g' Makefile" - # TODO fix in fminer Makefile - system "sed -i 's,-g, -g -I #{RbConfig::CONFIG['rubyhdrdir']} -I #{RbConfig::CONFIG['rubyarchhdrdir']} -I,' Makefile" # fix include path (CH) - system "sed -i '74s/$(CC)/$(CC) -Wl,-rpath,#{ob_lib.gsub('/','\/')} -L/' Makefile" # fix library path (CH) - system "make ruby" -end - -# install last-utils -FileUtils.cd main_dir -system "git clone git://github.com/amaunz/last-utils.git" -FileUtils.cd File.join(main_dir,"last-utils") -`sed -i '8s/"openbabel", //' lu.rb` - -# install R packagemain_dir $makefile_created = true diff --git a/lib/bbrc.rb b/lib/bbrc.rb deleted file mode 100644 index 4594f68..0000000 --- a/lib/bbrc.rb +++ /dev/null @@ -1,165 +0,0 @@ -module OpenTox - module Algorithm - class Fminer - TABLE_OF_ELEMENTS = [ -"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", 
"Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] - - # - # Run bbrc algorithm on dataset - # - # @param [OpenTox::Dataset] training dataset - # @param [optional] parameters BBRC parameters, accepted parameters are - # - min_frequency Minimum frequency (default 5) - # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") - # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true") - # - min_chisq_significance Significance threshold (between 0 and 1) - # - nr_hits Set to "true" to get hit count instead of presence - # - get_target Set to "true" to obtain target variable as feature - # @return [OpenTox::Dataset] Fminer Dataset - def self.bbrc training_dataset, params={} - - time = Time.now - bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 - - prediction_feature = training_dataset.features.first - if params[:min_frequency] - minfreq = params[:min_frequency] - else - per_mil = 5 # value from latest version - per_mil = 8 # as suggested below - i = training_dataset.feature_ids.index prediction_feature.id - nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size - minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST - minfreq = 2 unless minfreq > 2 - minfreq = minfreq.round - end - - @bbrc ||= Bbrc::Bbrc.new - @bbrc.Reset - if prediction_feature.numeric - @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! - else - bad_request_error "No accept values for "\ - "dataset '#{training_dataset.id}' and "\ - "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values - value2act = Hash[[*prediction_feature.accept_values.map.with_index]] - end - @bbrc.SetMinfreq(minfreq) - @bbrc.SetType(1) if params[:feature_type] == "paths" - @bbrc.SetBackbone(false) if params[:backbone] == "false" - @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] - @bbrc.SetConsoleOut(false) - - params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false - feature_dataset = FminerDataset.new( - :training_dataset_id => training_dataset.id, - :training_algorithm => "#{self.to_s}.bbrc", - :training_feature_id => prediction_feature.id , - :training_parameters => { - :min_frequency => minfreq, - :nr_hits => nr_hits, - :backbone => (params[:backbone] == false ? false : true) - } - - ) - feature_dataset.compounds = training_dataset.compounds - - # add data - training_dataset.compounds.each_with_index do |compound,i| - act = value2act[training_dataset.data_entries[i].first] - if act # TODO check if this works - @bbrc.AddCompound(compound.smiles,i+1) - @bbrc.AddActivity(act,i+1) - end - end - #g_median=@fminer.all_activities.values.to_scale.median - - #task.progress 10 - #step_width = 80 / @bbrc.GetNoRootNodes().to_f - - $logger.debug "BBRC setup: #{Time.now-time}" - time = Time.now - ftime = 0 - itime = 0 - rtime = 0 - - # run @bbrc - (0 .. 
@bbrc.GetNoRootNodes()-1).each do |j| - results = @bbrc.MineRoot(j) - results.each do |result| - rt = Time.now - f = YAML.load(result)[0] - smarts = f.shift - # convert fminer SMARTS representation into a more human readable format - smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do - element = TABLE_OF_ELEMENTS[$1.to_i-1] - $2 == "a" ? element.downcase : element - end - p_value = f.shift - f.flatten! - compound_idxs = f.collect{|e| e.first.first-1} - # majority class - effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode - -=begin - if (!@bbrc.GetRegression) - id_arrs = f[2..-1].flatten - max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc - effect = max+1 - else #regression part - id_arrs = f[2] - # DV: effect calculation - f_arr=Array.new - f[2].each do |id| - id=id.keys[0] # extract id from hit count hash - f_arr.push(@fminer.all_activities[id]) - end - f_median=f_arr.to_scale.median - if g_median >= f_median - effect = 'activating' - else - effect = 'deactivating' - end - end -=end - rtime += Time.now - rt - - ft = Time.now - feature = OpenTox::FminerSmarts.find_or_create_by({ - "smarts" => smarts, - "p_value" => p_value.to_f.abs.round(5), - "effect" => effect, - "dataset_id" => feature_dataset.id - }) - feature_dataset.feature_ids << feature.id - ftime += Time.now - ft - - it = Time.now - f.each do |id_count_hash| - id_count_hash.each do |id,count| - nr_hits ? count = count.to_i : count = 1 - feature_dataset.data_entries[id-1] ||= [] - feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count - end - end - itime += Time.now - it - - end - end - - $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})" - time = Time.now - - feature_dataset.fill_nil_with 0 - - $logger.debug "Prepare save: #{Time.now-time}" - time = Time.now - feature_dataset.save - - $logger.debug "Save: #{Time.now-time}" - feature_dataset - - end - end - end -end diff --git a/lib/classification.rb b/lib/classification.rb index 7a225bb..abbb5b3 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -92,7 +92,6 @@ module OpenTox prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting prediction = prediction.sub(/Val/,"") if prediction # Convert back confidence = 0.0 if prediction.nil? - #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) end {:value => prediction, :confidence => confidence} diff --git a/lib/compound.rb b/lib/compound.rb index 8c11831..2a79fd6 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,7 +1,3 @@ -# TODO: check -# *** Open Babel Error in ParseFile -# Could not find contribution data file. 
- CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" module OpenTox @@ -9,7 +5,6 @@ module OpenTox class Compound require_relative "unique_descriptors.rb" include OpenTox - include OpenTox::Descriptor DEFAULT_FINGERPRINT = "MP2D" @@ -22,7 +17,6 @@ module OpenTox field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId - field :molecular_weight, type: Float field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer field :physchem_descriptors, type: Hash, default: {} @@ -30,7 +24,6 @@ module OpenTox field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) - #index({default_fingerprint: 1}, {unique: false}) # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params @@ -106,7 +99,24 @@ module OpenTox end end save - physchem_descriptors + physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + end + + def smarts_match smarts, count=false + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('smi') + obconversion.read_string(obmol,self.smiles) + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts.collect do |sma| + smarts_pattern.init(sma.smarts) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 + end + value + end end # Create a compound from smiles string @@ -281,34 +291,16 @@ module OpenTox training_dataset = Dataset.find(params[:training_dataset_id]) prediction_feature = training_dataset.features.first training_dataset.compounds.each do |compound| - #unless self == compound - candidate_fingerprint = compound.fingerprint params[:type] - sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - feature_values = training_dataset.values(compound,prediction_feature) - neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] - #end + candidate_fingerprint = compound.fingerprint params[:type] + sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f + feature_values = training_dataset.values(compound,prediction_feature) + neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end neighbors end - def fminer_neighbors params - bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." 
unless params[:feature_dataset_id] and params[:min_sim] - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) - neighbors = [] - - # find neighbors - feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| - sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint - if sim >= params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming - end - end - neighbors - end - def physchem_neighbors params feature_dataset = Dataset.find params[:feature_dataset_id] query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] @@ -317,13 +309,7 @@ module OpenTox # TODO implement pearson and cosine similarity separatly R.assign "x", query_fingerprint R.assign "y", candidate_fingerprint - # pearson r - #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby - #p "pearson" - #p sim - #p "cosine" sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first - #p sim if sim >= params[:min_sim] neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming end @@ -357,9 +343,6 @@ module OpenTox ] $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} - - - #$mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } end @@ -378,10 +361,8 @@ module OpenTox # Calculate molecular weight of Compound with OB and store it in object # @return [Float] molecular weight def molecular_weight - if self["molecular_weight"]==0.0 || self["molecular_weight"].nil? - update(:molecular_weight => OpenTox::Algorithm::Descriptor.physchem(self, ["Openbabel.MW"]).first) - end - self["molecular_weight"].to_f + mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") + physchem([mw_feature])[mw_feature.id.to_s] end private diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index ea32a2b..cd94e33 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -55,7 +55,6 @@ module OpenTox predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence ) $logger.debug "Nr unpredicted: #{nr_unpredicted}" - #cv.statistics cv end end diff --git a/lib/dataset.rb b/lib/dataset.rb index b9c2187..af851b5 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -132,7 +132,6 @@ module OpenTox end end - # Parsers # Create a dataset from file (csv,sdf,...) 
@@ -211,7 +210,6 @@ module OpenTox value_time = 0 # compounds and values - #@data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)} self.data_entries = [] table.each_with_index do |vals,i| diff --git a/lib/descriptor.rb b/lib/descriptor.rb deleted file mode 100644 index 14a123b..0000000 --- a/lib/descriptor.rb +++ /dev/null @@ -1,252 +0,0 @@ -require 'digest/md5' -ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" -# TODO store descriptors in mongodb - -module OpenTox - - #module Algorithm - - # Class for descriptor calculations - module Descriptor - include OpenTox - - JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") - CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last - JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") - LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") - JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") - - obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] - OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| - name,description = d.split(/\s+/,2) - ["Openbabel_"+name,description] unless obexclude.include? name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) - CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] - CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"_"+name } }.flatten - - # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) - joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] - # strip Joelib messages from stdout - JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| - name = d[:java_class].sub(/^joelib2.feature.types./,'').gsub(/\./,"_") - ["Joelib_"+name, "impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java"] unless joelibexclude.include? 
name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) - DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys - - require_relative "unique_descriptors.rb" - - # Description of available descriptors - def self.description descriptor - lib = descriptor.split('_').first - case lib - when "Openbabel" - OBDESCRIPTORS[descriptor] - when "Cdk" - name = descriptor.split('_')[0..-2].join('_') - CDKDESCRIPTORS[name] - when "Joelib" - JOELIBDESCRIPTORS[descriptor] - when "lookup" - "Read feature values from a dataset" - end - end - - # Match an array of smarts features - def self.smarts_match compounds, smarts_features, count=false - bad_request_error "Compounds for smarts_match are empty" unless compounds - bad_request_error "Smarts features for smarts_match are empty" unless smarts_features - parse compounds - @count = count - obconversion = OpenBabel::OBConversion.new - obmol = OpenBabel::OBMol.new - obconversion.set_in_format('smi') - smarts_pattern = OpenBabel::OBSmartsPattern.new - smarts_features = [smarts_features] if smarts_features.is_a?(Feature) - @smarts = smarts_features.collect{|f| f.smarts} - @physchem_descriptors = nil - @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} - @compounds.each_with_index do |compound,c| - obconversion.read_string(obmol,compound.smiles) - @smarts.each_with_index do |smart,s| - smarts_pattern.init(smart) - if smarts_pattern.match(obmol) - count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 - else - value = 0 - end - @data_entries[c][s] = value - end - end - serialize - end - - # Count matches of an array with smarts features - def self.smarts_count compounds, smarts - # TODO: non-overlapping matches? 
- smarts_match compounds,smarts,true - end - - # Calculate physchem descriptors - # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset - def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS - parse compounds - @data_entries = Array.new(@compounds.size){[]} - @descriptors = descriptors - @smarts = nil - @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features - des = {} - @descriptors.each do |d| - lib, descriptor = d.split("_",2) - lib = lib.downcase.to_sym - des[lib] ||= [] - des[lib] << descriptor - end - des.each do |lib,descriptors| - send(lib, descriptors) - end - serialize - end - - def self.openbabel descriptors - $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds" - obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format 'smi' - last_feature_idx = @physchem_descriptors.size - @compounds.each_with_index do |compound,c| - obconversion.read_string obmol, compound.smiles - obdescriptors.each_with_index do |descriptor,d| - @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) - end - end - @physchem_descriptors += descriptors.collect{|d| "Openbabel_#{d}"} - end - - def self.java_descriptors descriptors, lib - $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds" - sdf = sdf_3d - # use java system call (rjb blocks within tasks) - # use Tempfiles to avoid "Argument list too long" error - case lib - when "cdk" - run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" - when "joelib" - run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" - end - last_feature_idx = @physchem_descriptors.size - YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| - # TODO create warnings - #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty? 
- # CDK Descriptors may calculate multiple values, they are stored in separate features - @physchem_descriptors += calculation.keys if i == 0 - calculation.keys.each_with_index do |name,j| - @data_entries[i][j+last_feature_idx] = fix_value(calculation[name]) - end - end - FileUtils.rm "#{sdf}#{lib}.yaml" - end - - def self.cdk descriptors - java_descriptors descriptors, "cdk" - end - - def self.joelib descriptors - java_descriptors descriptors, "joelib" - end - - def self.lookup compounds, features, dataset - parse compounds - fingerprint = [] - compounds.each do |compound| - fingerprint << [] - features.each do |feature| - end - end - end - - def self.run_cmd cmd - cmd = "#{cmd} 2>&1" - $logger.debug "running external cmd: '#{cmd}'" - p = IO.popen(cmd) do |io| - while line = io.gets - $logger.debug "> #{line.chomp}" - end - io.close - raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0 - end - end - - def self.sdf_3d - # TODO check if 3d sdfs are stored in GridFS - sdf = "" - @compounds.each do |compound| - sdf << compound.sdf - end - sdf_file = "/tmp/#{SecureRandom.uuid}.sdf" - File.open(sdf_file,"w+"){|f| f.print sdf} - sdf_file - end - - def self.parse compounds - @input_class = compounds.class.to_s - case @input_class - when "OpenTox::Compound" - @compounds = [compounds] - when "Array" - @compounds = compounds - when "OpenTox::Dataset" - @compounds = compounds.compounds - else - bad_request_error "Cannot calculate descriptors for #{compounds.class} objects." - end - end - - def self.serialize - #@data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} - case @input_class - # TODO beautify and fix for other objects - when "OpenTox::Compound" - r = {} - @data_entries.first.each_with_index do |d,i| - # TODO fix @ source - r[@physchem_descriptors[i].gsub(/\./,'_')] = d - end - r - when "Array" - @data_entries - when "OpenTox::Dataset" - dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) - if @smarts - dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} - @count ? algo = "count" : algo = "match" - dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" - - elsif @physchem_descriptors - dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} - dataset.data_entries = @data_entries - dataset.feature_calculation_algorithm = "#{self}.physchem" - #TODO params? - end - dataset.save - dataset - end - end - - def self.fix_value val - val = val.first if val.is_a? Array and val.size == 1 - val = nil if val == "NaN" - if val.numeric? - val = Float(val) - val = nil if val.nan? or val.infinite? 
- end - val - end - private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize - end - #end -end diff --git a/lib/feature.rb b/lib/feature.rb index 21572ca..b58946b 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -10,7 +10,6 @@ module OpenTox # Feature for categorical variables class NominalFeature < Feature - # TODO check if accept_values are still needed field :accept_values, type: Array def initialize params super params @@ -35,14 +34,6 @@ module OpenTox end end - # Feature for supervised fragments from Fminer algorithm - class FminerSmarts < Smarts - field :p_value, type: Float - # TODO check if effect is used - field :effect, type: String - field :dataset_id - end - # Feature for categorical bioassay results class NominalBioAssay < NominalFeature end diff --git a/lib/lazar.rb b/lib/lazar.rb index 63257ca..0125d27 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -24,7 +24,6 @@ Mongoid.load_configuration({ } }) Mongoid.raise_not_found_error = false # return nil if no document is found -#$mongo = Mongoid.default_client $mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}") $gridfs = $mongo.database.fs @@ -57,9 +56,6 @@ suppressPackageStartupMessages({ " # Require sub-Repositories -#require_relative '../libfminer/libbbrc/bbrc' # include before openbabel -#require_relative '../libfminer/liblast/last' # -#require_relative '../last-utils/lu.rb' require_relative '../openbabel/lib/openbabel' # Fminer environment variables @@ -79,14 +75,10 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO "opentox.rb", "feature.rb", "physchem.rb", - "descriptor.rb", "compound.rb", "dataset.rb", - "descriptor.rb", "algorithm.rb", - #"bbrc.rb", "model.rb", - "similarity.rb", "classification.rb", "regression.rb", "validation.rb", diff --git a/lib/model.rb b/lib/model.rb index 8cffdfd..ebc0db3 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -163,8 +163,6 @@ module OpenTox :type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.1 - #:type => "FP4", - #:min_sim => 0.7 }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end @@ -197,7 +195,6 @@ module OpenTox include Mongoid::Document include Mongoid::Timestamps - # TODO cv -> repeated cv # TODO field Validations field :endpoint, type: String field :species, type: String diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 2287a92..cef5758 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -23,10 +23,10 @@ class Numeric end class Float - # round to significant digits + # round to n significant digits # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby - def signif(signs) - Float("%.#{signs}g" % self) + def signif(n) + Float("%.#{n}g" % self) end end diff --git a/lib/physchem.rb b/lib/physchem.rb index 64018ad..067cd59 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -37,6 +37,10 @@ module OpenTox DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + DESCRIPTORS.each do |name,description| + lib,desc = name.split('.',2) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end require_relative "unique_descriptors.rb" diff --git a/lib/regression.rb b/lib/regression.rb index 2bf8915..e0b109e 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -23,7 +23,6 @@ module OpenTox end # TODO explicit neighbors, also for physchem - #def 
self.local_fingerprint_regression compound, params, method="pls", method_params="ncomp = 4" def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -129,7 +128,7 @@ module OpenTox R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # begin - R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"#, #{params}" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" rescue return nil end diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb index 6b5d602..9321a75 100644 --- a/lib/rest-client-wrapper.rb +++ b/lib/rest-client-wrapper.rb @@ -29,7 +29,6 @@ module OpenTox bad_request_error "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash) headers[:subjectid] ||= @@subjectid bad_request_error "Invalid URI: '#{uri}'" unless URI.valid? uri - #resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri) # make sure that no header parameters are set in the payload [:accept,:content_type,:subjectid].each do |header| if defined? $aa || URI(uri).host == URI($aa[:uri]).host diff --git a/lib/similarity.rb b/lib/similarity.rb deleted file mode 100644 index 91e18db..0000000 --- a/lib/similarity.rb +++ /dev/null @@ -1,58 +0,0 @@ -=begin -* Name: similarity.rb -* Description: Similarity algorithms -* Author: Andreas Maunz 0 and b.size>0 - if a.size>12 && b.size>12 - a = a[0..11] - b = b[0..11] - end - a_vec = a.to_gv - b_vec = b.to_gv - val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm) - end - val - end - - end - - end -end diff --git a/lib/validation.rb b/lib/validation.rb index 9c19cde..3659341 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -102,16 +102,6 @@ module OpenTox weighted_mae = weighted_mae/confidence_sum rmse = Math.sqrt(rmse/predictions.size) weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) -=begin - update_attributes( - mae: mae, - rmse: rmse, - weighted_mae: weighted_mae, - weighted_rmse: weighted_rmse, - r_squared: r**2, - finished_at: Time.now - ) -=end { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } end end diff --git a/test/compound.rb b/test/compound.rb index 6c866b3..7342310 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -64,8 +64,7 @@ print c.sdf def test_chemblid c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H" - #assert_equal "CHEMBL277500", c.chemblid - assert_equal "CHEMBL581676", c.chemblid + assert_equal "CHEMBL277500", c.chemblid end def test_sdf_storage diff --git a/test/dataset.rb b/test/dataset.rb index 76eaf60..2f75703 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -69,7 +69,7 @@ class DatasetTest < MiniTest::Test assert_equal 3, d.compounds.size assert_equal 2, d.features.size assert_equal [[1,2],[4,5],[6,7]], d.data_entries - d.save_all + d.save # check if dataset has been saved correctly new_dataset = Dataset.find d.id assert_equal 3, new_dataset.compounds.size diff --git a/test/descriptor.rb b/test/descriptor.rb index 28be79e..d7d1385 100644 --- a/test/descriptor.rb +++ b/test/descriptor.rb @@ -4,81 +4,65 @@ class DescriptorTest < MiniTest::Test def test_list # check available descriptors - @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys - assert_equal 110,@descriptors.size,"wrong num physchem 
descriptors" - @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES - assert_equal 355,@descriptor_values.size,"wrong num physchem descriptors" - sum = 0 - [ @descriptors, @descriptor_values ].each do |desc| - {"Openbabel"=>15,"Cdk"=>(desc==@descriptors ? 50 : 295),"Joelib"=>45}.each do |k,v| - assert_equal v,desc.select{|x| x=~/^#{k}\./}.size,"wrong num #{k} descriptors" - sum += v - end - end - assert_equal (465),sum + assert_equal 355,PhysChem.descriptors.size,"incorrect number of physchem descriptors" + assert_equal 15,PhysChem.openbabel_descriptors.size,"incorrect number of Openbabel descriptors" + assert_equal 295,PhysChem.cdk_descriptors.size,"incorrect number of Cdk descriptors" + assert_equal 45,PhysChem.joelib_descriptors.size,"incorrect number of Joelib descriptors" end def test_smarts c = OpenTox::Compound.from_smiles "N=C=C1CCC(=F=FO)C1" File.open("tmp.png","w+"){|f| f.puts c.png} s = Smarts.find_or_create_by(:smarts => "F=F") - result = OpenTox::Algorithm::Descriptor.smarts_match c, s + result = c.smarts_match [s] assert_equal [1], result smarts = ["CC", "C", "C=C", "CO", "F=F", "C1CCCC1", "NN"].collect{|s| Smarts.find_or_create_by(:smarts => s)} - result = OpenTox::Algorithm::Descriptor.smarts_match c, smarts + result = c.smarts_match smarts assert_equal [1, 1, 1, 0, 1, 1, 0], result smarts_count = [10, 6, 2, 0, 2, 10, 0] - result = OpenTox::Algorithm::Descriptor.smarts_count c, smarts + result = c.smarts_match smarts, true assert_equal smarts_count, result end def test_compound_openbabel_single c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c, ["Openbabel.logP"] - assert_equal 1.12518, result.first + result = c.physchem [PhysChem.find_or_create_by(:name => "Openbabel.logP")] + assert_equal 1.12518, result.first.last.round(5) end def test_compound_cdk_single c = OpenTox::Compound.from_smiles "c1ccccc1" - result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"] - assert_equal [12], result + result = c.physchem [PhysChem.find_or_create_by(:name => "Cdk.AtomCount.nAtom")] + assert_equal 12, result.first.last c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.AtomCount"] - assert_equal [17], result - result = OpenTox::Algorithm::Descriptor.physchem c, ["Cdk.CarbonTypes"] + result = c.physchem [PhysChem.find_or_create_by(:name => "Cdk.AtomCount.nAtom")] + assert_equal 17, result.first.last c_types = {"Cdk.CarbonTypes.C1SP1"=>1, "Cdk.CarbonTypes.C2SP1"=>0, "Cdk.CarbonTypes.C1SP2"=>0, "Cdk.CarbonTypes.C2SP2"=>1, "Cdk.CarbonTypes.C3SP2"=>0, "Cdk.CarbonTypes.C1SP3"=>2, "Cdk.CarbonTypes.C2SP3"=>1, "Cdk.CarbonTypes.C3SP3"=>1, "Cdk.CarbonTypes.C4SP3"=>0} - assert_equal [1, 0, 0, 1, 0, 2, 1, 1, 0], result + physchem_features = c_types.collect{|t,nr| PhysChem.find_or_create_by(:name => t)} + result = c.physchem physchem_features + assert_equal [1, 0, 0, 1, 0, 2, 1, 1, 0], result.values end def test_compound_joelib_single c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c, ["Joelib.LogP"] - assert_equal [2.65908], result + result = c.physchem [PhysChem.find_or_create_by(:name => "Joelib.LogP")] + assert_equal 2.65908, result.first.last end def test_compound_all c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c - assert_equal 330, result.size - assert_equal 30.8723, result[2] - assert_equal 5, result[328] - p result + result = c.physchem 
PhysChem.descriptors + amr = PhysChem.find_or_create_by(:name => "Cdk.ALOGP.AMR", :library => "Cdk") + sbonds = PhysChem.find_by(:name => "Openbabel.sbonds") + assert_equal 30.8723, result[amr.id.to_s] + assert_equal 5, result[sbonds.id.to_s] end def test_compound_descriptor_parameters c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" - result = OpenTox::Algorithm::Descriptor.physchem c, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ]#, true - assert_equal 12, result.size - assert_equal [1.12518, 17.0, 1, 0, 0, 1, 0, 2, 1, 1, 0, 2.65908], result#.last - end - - def test_dataset_descriptor_parameters - dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.mini.csv") - d = OpenTox::Algorithm::Descriptor.physchem dataset, [ "Openbabel.logP", "Cdk.AtomCount", "Cdk.CarbonTypes", "Joelib.LogP" ] - assert_kind_of Dataset, d - assert_equal dataset.compounds, d.compounds - assert_equal dataset.compounds.size, d.data_entries.size - assert_equal 12, d.data_entries.first.size + result = c.physchem [ "Openbabel.logP", "Cdk.AtomCount.nAtom", "Joelib.LogP" ].collect{|d| PhysChem.find_or_create_by(:name => d)} + assert_equal 3, result.size + assert_equal [1.12518, 17.0, 2.65908], result.values.collect{|v| v.round 5} end end -- cgit v1.2.3
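
Usage sketch of the descriptor API that the tests above switch to (SMARTS matching and physchem calculation as Compound instance methods, descriptors as PhysChem features). This is a sketch, not part of the patch: it assumes the lazar environment from lib/lazar.rb is loaded from the repository root and a local MongoDB is running; compound SMILES, feature names and the expected logP value are taken from test/descriptor.rb.

    require_relative 'lib/lazar'   # assumption: run from the repository root

    c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"

    # SMARTS matching is now an instance method on Compound
    smarts = ["CC", "F=F"].collect{|s| OpenTox::Smarts.find_or_create_by(:smarts => s)}
    c.smarts_match smarts        # presence (1/0) per SMARTS pattern
    c.smarts_match smarts, true  # second argument switches to match counts

    # physchem descriptors are PhysChem features; the result is keyed by feature id
    logp = OpenTox::PhysChem.find_or_create_by(:name => "Openbabel.logP")
    c.physchem([logp]).values.first.round(5)  # => 1.12518 (value asserted in test/descriptor.rb)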