diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/bbrc.rb | 165 | ||||
-rw-r--r-- | lib/classification.rb | 80 | ||||
-rw-r--r-- | lib/compound.rb | 148 | ||||
-rw-r--r-- | lib/crossvalidation.rb | 117 | ||||
-rw-r--r-- | lib/dataset.rb | 131 | ||||
-rw-r--r-- | lib/descriptor.rb | 248 | ||||
-rw-r--r-- | lib/feature.rb | 17 | ||||
-rw-r--r-- | lib/lazar.rb | 45 | ||||
-rw-r--r-- | lib/leave-one-out-validation.rb | 205 | ||||
-rw-r--r-- | lib/model.rb | 104 | ||||
-rw-r--r-- | lib/overwrite.rb | 8 | ||||
-rw-r--r-- | lib/physchem.rb | 133 | ||||
-rw-r--r-- | lib/regression.rb | 337 | ||||
-rw-r--r-- | lib/rest-client-wrapper.rb | 9 | ||||
-rw-r--r-- | lib/similarity.rb | 58 | ||||
-rw-r--r-- | lib/unique_descriptors.rb | 11 | ||||
-rw-r--r-- | lib/validation.rb | 48 |
17 files changed, 762 insertions, 1102 deletions
diff --git a/lib/bbrc.rb b/lib/bbrc.rb deleted file mode 100644 index c83b9b3..0000000 --- a/lib/bbrc.rb +++ /dev/null @@ -1,165 +0,0 @@ -module OpenTox - module Algorithm - class Fminer - TABLE_OF_ELEMENTS = [ -"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] - - # - # Run bbrc algorithm on dataset - # - # @param [OpenTox::Dataset] training dataset - # @param [optional] parameters BBRC parameters, accepted parameters are - # - min_frequency Minimum frequency (default 5) - # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") - # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. 
(default "true") - # - min_chisq_significance Significance threshold (between 0 and 1) - # - nr_hits Set to "true" to get hit count instead of presence - # - get_target Set to "true" to obtain target variable as feature - # @return [OpenTox::Dataset] Fminer Dataset - def self.bbrc training_dataset, params={} - - time = Time.now - bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 - - prediction_feature = training_dataset.features.first - if params[:min_frequency] - minfreq = params[:min_frequency] - else - per_mil = 5 # value from latest version - per_mil = 8 # as suggested below - i = training_dataset.feature_ids.index prediction_feature.id - nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size - minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST - minfreq = 2 unless minfreq > 2 - minfreq = minfreq.round - end - - @bbrc ||= Bbrc::Bbrc.new - @bbrc.Reset - if prediction_feature.numeric - @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! - else - bad_request_error "No accept values for "\ - "dataset '#{training_dataset.id}' and "\ - "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values - value2act = Hash[[*prediction_feature.accept_values.map.with_index]] - end - @bbrc.SetMinfreq(minfreq) - @bbrc.SetType(1) if params[:feature_type] == "paths" - @bbrc.SetBackbone(false) if params[:backbone] == "false" - @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] - @bbrc.SetConsoleOut(false) - - params[:nr_hits] ? 
nr_hits = params[:nr_hits] : nr_hits = false - feature_dataset = FminerDataset.new( - :training_dataset_id => training_dataset.id, - :training_algorithm => "#{self.to_s}.bbrc", - :training_feature_id => prediction_feature.id , - :training_parameters => { - :min_frequency => minfreq, - :nr_hits => nr_hits, - :backbone => (params[:backbone] == false ? false : true) - } - - ) - feature_dataset.compounds = training_dataset.compounds - - # add data - training_dataset.compounds.each_with_index do |compound,i| - act = value2act[training_dataset.data_entries[i].first] - if act # TODO check if this works - @bbrc.AddCompound(compound.smiles,i+1) - @bbrc.AddActivity(act,i+1) - end - end - #g_median=@fminer.all_activities.values.to_scale.median - - #task.progress 10 - #step_width = 80 / @bbrc.GetNoRootNodes().to_f - - $logger.debug "BBRC setup: #{Time.now-time}" - time = Time.now - ftime = 0 - itime = 0 - rtime = 0 - - # run @bbrc - (0 .. @bbrc.GetNoRootNodes()-1).each do |j| - results = @bbrc.MineRoot(j) - results.each do |result| - rt = Time.now - f = YAML.load(result)[0] - smarts = f.shift - # convert fminer SMARTS representation into a more human readable format - smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do - element = TABLE_OF_ELEMENTS[$1.to_i-1] - $2 == "a" ? element.downcase : element - end - p_value = f.shift - f.flatten! 
- compound_idxs = f.collect{|e| e.first.first-1} - # majority class - effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode - -=begin - if (!@bbrc.GetRegression) - id_arrs = f[2..-1].flatten - max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc - effect = max+1 - else #regression part - id_arrs = f[2] - # DV: effect calculation - f_arr=Array.new - f[2].each do |id| - id=id.keys[0] # extract id from hit count hash - f_arr.push(@fminer.all_activities[id]) - end - f_median=f_arr.to_scale.median - if g_median >= f_median - effect = 'activating' - else - effect = 'deactivating' - end - end -=end - rtime += Time.now - rt - - ft = Time.now - feature = OpenTox::FminerSmarts.find_or_create_by({ - "smarts" => smarts, - "p_value" => p_value.to_f.abs.round(5), - "effect" => effect, - "dataset_id" => feature_dataset.id - }) - feature_dataset.feature_ids << feature.id - ftime += Time.now - ft - - it = Time.now - f.each do |id_count_hash| - id_count_hash.each do |id,count| - nr_hits ? count = count.to_i : count = 1 - feature_dataset.data_entries[id-1] ||= [] - feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count - end - end - itime += Time.now - it - - end - end - - $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})" - time = Time.now - - feature_dataset.fill_nil_with 0 - - $logger.debug "Prepare save: #{Time.now-time}" - time = Time.now - feature_dataset.save_all - - $logger.debug "Save: #{Time.now-time}" - feature_dataset - - end - end - end -end diff --git a/lib/classification.rb b/lib/classification.rb index b4b2e59..0202940 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -5,14 +5,12 @@ module OpenTox def self.weighted_majority_vote compound, params neighbors = params[:neighbors] - return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty? 
weighted_sum = {} sim_sum = 0.0 confidence = 0.0 neighbors.each do |row| - n,sim,acts = row - #confidence = sim if sim > confidence # distance to nearest neighbor - acts.each do |act| + sim = row["tanimoto"] + row["features"][params[:prediction_feature_id].to_s].each do |act| weighted_sum[act] ||= 0 weighted_sum[act] += sim end @@ -30,81 +28,7 @@ module OpenTox bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'" end end - - # Classification with majority vote from neighbors weighted by similarity - # @param [Hash] params Keys `:activities, :sims, :value_map` are required - # @return [Numeric] A prediction value. - def self.fminer_weighted_majority_vote neighbors, training_dataset - - neighbor_contribution = 0.0 - confidence_sum = 0.0 - - $logger.debug "Weighted Majority Vote Classification." - - values = neighbors.collect{|n| n[2]}.uniq - neighbors.each do |neighbor| - i = training_dataset.compound_ids.index n.id - neighbor_weight = neighbor[1] - activity = values.index(neighbor[2]) + 1 # map values to integers > 1 - neighbor_contribution += activity * neighbor_weight - if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true - case activity - when 1 - confidence_sum -= neighbor_weight - when 2 - confidence_sum += neighbor_weight - end - else - confidence_sum += neighbor_weight - end - end - if values.size == 2 - if confidence_sum >= 0.0 - prediction = values[1] - elsif confidence_sum < 0.0 - prediction = values[0] - end - elsif values.size == 1 # all neighbors have the same value - prediction = values[0] - else - prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction - end - - confidence = (confidence_sum/neighbors.size).abs - {:value => prediction, :confidence => confidence.abs} - end - - # Local support vector regression from neighbors - # @param [Hash] params Keys `:props, :activities, :sims, 
:min_train_performance` are required - # @return [Numeric] A prediction value. - def self.local_svm_classification(params) - - confidence = 0.0 - prediction = nil - - $logger.debug "Local SVM." - if params[:activities].size>0 - if params[:props] - n_prop = params[:props][0].collect.to_a - q_prop = params[:props][1].collect.to_a - props = [ n_prop, q_prop ] - end - activities = params[:activities].collect.to_a - activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification - prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting - prediction = prediction.sub(/Val/,"") if prediction # Convert back - confidence = 0.0 if prediction.nil? - #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." - confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) - end - {:value => prediction, :confidence => confidence} - - end - - - end - end end diff --git a/lib/compound.rb b/lib/compound.rb index 4d32e24..e002305 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,12 +1,9 @@ -# TODO: check -# *** Open Babel Error in ParseFile -# Could not find contribution data file. 
- CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" module OpenTox class Compound + require_relative "unique_descriptors.rb" include OpenTox DEFAULT_FINGERPRINT = "MP2D" @@ -15,34 +12,35 @@ module OpenTox field :smiles, type: String field :inchikey, type: String field :names, type: Array - field :warning, type: String field :cid, type: String field :chemblid, type: String field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId - field :molecular_weight, type: Float field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer + field :physchem_descriptors, type: Hash, default: {} + field :dataset_ids, type: Array, default: [] + field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params compound = self.find_or_initialize_by params - compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT) + compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size compound.save compound end - def fingerprint type="MP2D" + def fingerprint type=DEFAULT_FINGERPRINT unless fingerprints[type] return [] unless self.smiles #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format if type == "MP2D" fp = obconversion(smiles,"smi","mpd").strip.split("\t") name = fp.shift # remove Title - fingerprints[type] = fp + fingerprints[type] = fp.uniq # no fingerprint counts #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html elsif type== "MNA" level = 2 # TODO: level as parameter, evaluate level 1, see paper @@ -82,19 +80,60 @@ module OpenTox fingerprints[type] end + def physchem descriptors=PhysChem.openbabel_descriptors + # TODO: speedup java descriptors + calculated_ids = physchem_descriptors.keys + # BSON::ObjectId instances are not allowed as keys in a BSON document. 
+ new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids + descs = {} + algos = {} + new_ids.each do |id| + descriptor = PhysChem.find id + descs[[descriptor.library, descriptor.descriptor]] = descriptor + algos[descriptor.name] = descriptor + end + # avoid recalculating Cdk features with multiple values + descs.keys.uniq.each do |k| + descs[k].send(k[0].downcase,k[1],self).each do |n,v| + physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + end + end + save + physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + end + + def smarts_match smarts, count=false + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('smi') + obconversion.read_string(obmol,self.smiles) + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts.collect do |sma| + smarts_pattern.init(sma.smarts) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 + end + value + end + end + # Create a compound from smiles string # @example # compound = OpenTox::Compound.from_smiles("c1ccccc1") # @param [String] smiles Smiles string # @return [OpenTox::Compound] Compound def self.from_smiles smiles - return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces." + return nil + end smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) if smiles.empty? + $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string." 
return nil - #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.") else - #Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) Compound.find_or_create_by :smiles => smiles end end @@ -109,7 +148,7 @@ module OpenTox #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") if smiles.empty? - Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.") + Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."]) else Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) end @@ -245,34 +284,19 @@ module OpenTox def fingerprint_neighbors params bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." 
unless params[:type] and params[:training_dataset_id] and params[:min_sim] neighbors = [] - #if params[:type] == DEFAULT_FINGERPRINT - #neighbors = db_neighbors params - #p neighbors - #else + if params[:type] == DEFAULT_FINGERPRINT + neighbors = db_neighbors params + else query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| - unless self == compound - candidate_fingerprint = compound.fingerprint params[:type] - sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - neighbors << [compound.id, sim] if sim >= params[:min_sim] - end - end - #end - neighbors.sort{|a,b| b.last <=> a.last} - end - - def fminer_neighbors params - bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim] - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) - neighbors = [] - - # find neighbors - feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| - sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint - if sim >= params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + training_dataset = Dataset.find(params[:training_dataset_id]) + prediction_feature = training_dataset.features.first + training_dataset.compounds.each do |compound| + candidate_fingerprint = compound.fingerprint params[:type] + sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f + feature_values = training_dataset.values(compound,prediction_feature) + neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= 
params[:min_sim] end + neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end neighbors end @@ -285,13 +309,7 @@ module OpenTox # TODO implement pearson and cosine similarity separatly R.assign "x", query_fingerprint R.assign "y", candidate_fingerprint - # pearson r - #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby - #p "pearson" - #p sim - #p "cosine" sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first - #p sim if sim >= params[:min_sim] neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming end @@ -300,53 +318,53 @@ module OpenTox end def db_neighbors params - p "DB NEIGHBORS" - p params - # TODO restrict to dataset # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb - qn = fingerprint(params[:type]).size + + #qn = default_fingerprint_size #qmin = qn * threshold #qmax = qn / threshold #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) 
#reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] aggregate = [ #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, - {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self + #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self {'$project' => { 'tanimoto' => {'$let' => { - 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}}, - 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]} + 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, + #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}}, + 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, - '_id' => 1 + '_id' => 1, + 'features' => 1, + 'dataset_ids' => 1 }}, {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, {'$sort' => {'tanimoto' => -1}} ] - $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } + $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} end - # Get mg from mmol - # @return [Float] value in mg - def mmol_to_mg mmol - mmol.to_f*molecular_weight + # Convert mmol to mg + # @return [Float] value in mg + def mmol_to_mg mmol + mmol.to_f*molecular_weight end - def mg_to_mmol mg - mg.to_f/molecular_weight + # Convert mg to mmol + # @return [Float] value in mg + def mg_to_mmol mg + mg.to_f/molecular_weight end # Calculate molecular weight of Compound with OB and store it in object # @return [Float] molecular weight def molecular_weight - if self["molecular_weight"]==0.0 || self["molecular_weight"].nil? 
- update(:molecular_weight => OpenTox::Algorithm::Descriptor.physchem(self, ["Openbabel.MW"]).first) - end - self["molecular_weight"] + mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") + physchem([mw_feature])[mw_feature.id.to_s] end - private def self.obconversion(identifier,input_format,output_format,option=nil) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 2e6dabb..15dfb21 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -35,14 +35,14 @@ module OpenTox predictions = [] training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| - fork do # parallel execution of validations + #fork do # parallel execution of validations $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" t = Time.now validation = Validation.create(model, fold[0], fold[1],cv) $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - end + #end end - Process.waitall + #Process.waitall cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) cv.validations.each do |validation| nr_instances += validation.nr_instances @@ -52,7 +52,7 @@ module OpenTox cv.update_attributes( nr_instances: nr_instances, nr_unpredicted: nr_unpredicted, - predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence + predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence ) $logger.debug "Nr unpredicted: #{nr_unpredicted}" cv.statistics @@ -79,23 +79,26 @@ module OpenTox true_rate = {} predictivity = {} predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction and confidence.numeric? 
- if prediction == activity - if prediction == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += confidence - end - elsif prediction != activity - if prediction == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += confidence + compound_id,activities,prediction,confidence = pred + if activities and prediction #and confidence.numeric? + if activities.uniq.size == 1 + activity = activities.uniq.first + if prediction == activity + if prediction == accept_values[0] + confusion_matrix[0][0] += 1 + #weighted_confusion_matrix[0][0] += confidence + elsif prediction == accept_values[1] + confusion_matrix[1][1] += 1 + #weighted_confusion_matrix[1][1] += confidence + end + elsif prediction != activity + if prediction == accept_values[0] + confusion_matrix[0][1] += 1 + #weighted_confusion_matrix[0][1] += confidence + elsif prediction == accept_values[1] + confusion_matrix[1][0] += 1 + #weighted_confusion_matrix[1][0] += confidence + end end end else @@ -109,17 +112,17 @@ module OpenTox predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f end confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c - end - end + #weighted_confusion_matrix.each do |r| + #r.each do |c| + #confidence_sum += c + #end + #end update_attributes( accept_values: accept_values, confusion_matrix: confusion_matrix, - weighted_confusion_matrix: weighted_confusion_matrix, + #weighted_confusion_matrix: weighted_confusion_matrix, accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, + 
#weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, true_rate: true_rate, predictivity: predictivity, finished_at: Time.now @@ -129,14 +132,14 @@ module OpenTox def confidence_plot unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.svg" + tmpfile = "/tmp/#{id.to_s}_confidence.png" accuracies = [] confidences = [] correct_predictions = 0 incorrect_predictions = 0 predictions.each do |p| if p[1] and p[2] - p[1] == p [2] ? correct_predictions += 1 : incorrect_predictions += 1 + p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1 accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f confidences << p[3] @@ -146,7 +149,7 @@ module OpenTox R.assign "confidence", confidences R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") plot_id = $gridfs.insert_one(file) update(:confidence_plot_id => plot_id) end @@ -162,52 +165,46 @@ module OpenTox field :rmse, type: Float field :mae, type: Float - field :weighted_rmse, type: Float - field :weighted_mae, type: Float field :r_squared, type: Float field :correlation_plot_id, type: BSON::ObjectId - field :confidence_plot_id, type: BSON::ObjectId def statistics rmse = 0 - weighted_rmse = 0 - rse = 0 - weighted_rse = 0 mae = 0 - weighted_mae = 0 - rae = 0 - weighted_rae = 0 - confidence_sum = 0 + x = [] + y = [] predictions.each do |pred| compound_id,activity,prediction,confidence = pred - if activity and prediction - error = Math.log10(prediction)-Math.log10(activity) - rmse += error**2 - weighted_rmse += confidence*error**2 - mae += error.abs - weighted_mae += confidence*error.abs - confidence_sum += confidence + if activity and 
prediction + unless activity == [nil] + x << -Math.log10(activity.median) + y << -Math.log10(prediction) + error = Math.log10(prediction)-Math.log10(activity.median) + rmse += error**2 + #weighted_rmse += confidence*error**2 + mae += error.abs + #weighted_mae += confidence*error.abs + #confidence_sum += confidence + end else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." end end - x = predictions.collect{|p| p[1]} - y = predictions.collect{|p| p[2]} R.assign "measurement", x R.assign "prediction", y - R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" + R.eval "r <- cor(measurement,prediction,use='complete')" r = R.eval("r").to_ruby mae = mae/predictions.size - weighted_mae = weighted_mae/confidence_sum + #weighted_mae = weighted_mae/confidence_sum rmse = Math.sqrt(rmse/predictions.size) - weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) + #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) update_attributes( mae: mae, rmse: rmse, - weighted_mae: weighted_mae, - weighted_rmse: weighted_rmse, + #weighted_mae: weighted_mae, + #weighted_rmse: weighted_rmse, r_squared: r**2, finished_at: Time.now ) @@ -243,11 +240,11 @@ module OpenTox :neighbors => neighbors } end - end.compact.sort{|a,b| p a; b[:relative_error] <=> a[:relative_error]}[0..n-1] + end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1] end def confidence_plot - tmpfile = "/tmp/#{id.to_s}_confidence.svg" + tmpfile = "/tmp/#{id.to_s}_confidence.png" sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact R.assign "error", sorted_predictions.collect{|p| p[0]} R.assign "confidence", sorted_predictions.collect{|p| p[1]} @@ -255,7 +252,7 @@ module OpenTox R.eval "image = qplot(confidence,error)" 
R.eval "image = image + stat_smooth(method='lm', se=FALSE)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") plot_id = $gridfs.insert_one(file) update(:confidence_plot_id => plot_id) $gridfs.find_one(_id: confidence_plot_id).data @@ -263,7 +260,7 @@ module OpenTox def correlation_plot unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.svg" + tmpfile = "/tmp/#{id.to_s}_correlation.png" x = predictions.collect{|p| p[1]} y = predictions.collect{|p| p[2]} attributes = Model::Lazar.find(self.model_id).attributes @@ -276,7 +273,7 @@ module OpenTox R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) end diff --git a/lib/dataset.rb b/lib/dataset.rb index 4da8fd6..5d8aeaf 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,25 +5,12 @@ module OpenTox class Dataset - #attr_writer :data_entries - # associations like has_many, belongs_to deteriorate performance field :feature_ids, type: Array, default: [] field :compound_ids, type: Array, default: [] - #field :data_entries_id, type: BSON::ObjectId field :data_entries, type: Array, default: [] field :source, type: String - # Save all data including data_entries - # Should be used instead of save - def save_all - save - #dump = Marshal.dump(@data_entries) - #file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries") - #entries_id = 
$gridfs.insert_one(file) - #update(:data_entries_id => entries_id) - end - # Readers # Get all compounds @@ -38,33 +25,6 @@ module OpenTox @features end -=begin - # Get all data_entries - def data_entries - unless @data_entries - t = Time.now - data_entry_file = $gridfs.find_one(_id: data_entries_id) - if data_entry_file.nil? - @data_entries = [] - else - @data_entries = Marshal.load(data_entry_file.data) - bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array - unless @data_entries.first.size == feature_ids.size - # TODO: fix (unknown) source of empty data_entries - sleep 1 - data_entry_file = $gridfs.find_one(_id: data_entries_id) - @data_entries = Marshal.load(data_entry_file.data) - end - bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size - # TODO: data_entries can be empty, poorly reproducible, mongo problem? 
- bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size - #$logger.debug "Retrieving data: #{Time.now-t}" - end - end - @data_entries - end -=end - # Find data entry values for a given compound and feature # @param compound [OpenTox::Compound] OpenTox Compound object # @param feature [OpenTox::Feature] OpenTox Feature object @@ -93,7 +53,13 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - len = self.compound_ids.size + unique_compound_data = {} + compound_ids.each_with_index do |cid,i| + unique_compound_data[cid] ||= [] + unique_compound_data[cid] << data_entries[i] + end + unique_compound_ids = unique_compound_data.keys + len = unique_compound_ids.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -102,22 +68,44 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_cids = test_idxs.collect{|i| self.compound_ids[i]} - test_data_entries = test_idxs.collect{|i| self.data_entries[i]} - test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries) + test_cids = test_idxs.collect{|i| unique_compound_ids[i]} training_idxs = indices-test_idxs - training_cids = training_idxs.collect{|i| self.compound_ids[i]} - training_data_entries = training_idxs.collect{|i| self.data_entries[i]} - training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries) - test_dataset.save_all - training_dataset.save_all - chunks << [training_dataset,test_dataset] + training_cids = training_idxs.collect{|i| unique_compound_ids[i]} + chunk = [training_cids,test_cids].collect do |unique_cids| + cids = [] + data_entries = [] + unique_cids.each do |cid| + 
unique_compound_data[cid].each do |de| + cids << cid + data_entries << de + end + end + dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id ) + dataset.compounds.each do |compound| + compound.dataset_ids << dataset.id + compound.save + end + dataset.save + dataset + end start = last+1 + chunks << chunk end chunks end # Diagnostics + + def duplicates feature=self.features.first + col = feature_ids.index feature.id + dups = {} + compound_ids.each_with_index do |cid,i| + rows = compound_ids.each_index.select{|r| compound_ids[r] == cid } + values = rows.collect{|row| data_entries[row][col]} + dups[cid] = values if values.size > 1 + end + dups + end def correlation_plot training_dataset # TODO: create/store svg @@ -145,7 +133,6 @@ module OpenTox end end - # Parsers # Create a dataset from file (csv,sdf,...) @@ -154,10 +141,10 @@ module OpenTox # TODO #def self.from_sdf_file #end - + # Create a dataset from CSV file # TODO: document structure - def self.from_csv_file file, source=nil, bioassay=true + def self.from_csv_file file, source=nil, bioassay=true#, layout={} source ||= file name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) @@ -167,7 +154,7 @@ module OpenTox $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' dataset = self.new(:source => source, :name => name) - dataset.parse_table table, bioassay + dataset.parse_table table, bioassay#, layout end dataset end @@ -224,12 +211,11 @@ module OpenTox value_time = 0 # compounds and values - #@data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)} self.data_entries = [] table.each_with_index do |vals,i| ct = Time.now - identifier = vals.shift + identifier = vals.shift.strip warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty? 
begin case compound_format @@ -246,7 +232,7 @@ module OpenTox warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end - # TODO insert empty compounds to keep positions? + compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id compound_time += Time.now-ct r += 1 @@ -263,10 +249,15 @@ module OpenTox warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." next elsif numeric[j] - self.data_entries.last[j] = v.to_f + v = v.to_f else - self.data_entries.last[j] = v.strip + v = v.strip end + self.data_entries.last[j] = v + #i = compound.feature_ids.index feature_ids[j] + compound.features[feature_ids[j].to_s] ||= [] + compound.features[feature_ids[j].to_s] << v + compound.save end end compounds.duplicates.each do |compound| @@ -293,28 +284,6 @@ module OpenTox end end - def scale - scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)} - centers = [] - scales = [] - feature_ids.each_with_index do |feature_id,col| - R.assign "x", data_entries.collect{|de| de[col]} - R.eval "scaled = scale(x,center=T,scale=T)" - centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby - scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby - R.eval("scaled").to_ruby.each_with_index do |value,row| - scaled_data_entries[row][col] = value - end - end - scaled_dataset = ScaledDataset.new(attributes) - scaled_dataset["_id"] = BSON::ObjectId.new - scaled_dataset["_type"] = "OpenTox::ScaledDataset" - scaled_dataset.centers = centers - scaled_dataset.scales = scales - scaled_dataset.data_entries = scaled_data_entries - scaled_dataset.save_all - scaled_dataset - end end # Dataset for lazar predictions diff --git a/lib/descriptor.rb b/lib/descriptor.rb deleted file mode 100644 index 9733bde..0000000 --- a/lib/descriptor.rb +++ /dev/null @@ -1,248 +0,0 @@ -require 'digest/md5' -ENV["JAVA_HOME"] ||= 
"/usr/lib/jvm/java-7-openjdk" -# TODO store descriptors in mongodb - -module OpenTox - - module Algorithm - - # Class for descriptor calculations - class Descriptor - include OpenTox - - JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") - CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last - JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") - LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") - JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") - - obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] - OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| - name,description = d.split(/\s+/,2) - ["Openbabel."+name,description] unless obexclude.include? name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) - CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] - CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten - - # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) - joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] - # strip Joelib messages from stdout - JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| - name = d[:java_class].sub(/^joelib2.feature.types./,'') - # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java - ["Joelib."+name, "no description available"] unless joelibexclude.include? 
name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) - DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys - - require_relative "unique_descriptors.rb" - - # Description of available descriptors - def self.description descriptor - lib = descriptor.split('.').first - case lib - when "Openbabel" - OBDESCRIPTORS[descriptor] - when "Cdk" - name = descriptor.split('.')[0..-2].join('.') - CDKDESCRIPTORS[name] - when "Joelib" - JOELIBDESCRIPTORS[descriptor] - when "lookup" - "Read feature values from a dataset" - end - end - - # Match an array of smarts features - def self.smarts_match compounds, smarts_features, count=false - bad_request_error "Compounds for smarts_match are empty" unless compounds - bad_request_error "Smarts features for smarts_match are empty" unless smarts_features - parse compounds - @count = count - obconversion = OpenBabel::OBConversion.new - obmol = OpenBabel::OBMol.new - obconversion.set_in_format('smi') - smarts_pattern = OpenBabel::OBSmartsPattern.new - smarts_features = [smarts_features] if smarts_features.is_a?(Feature) - @smarts = smarts_features.collect{|f| f.smarts} - @physchem_descriptors = nil - @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} - @compounds.each_with_index do |compound,c| - obconversion.read_string(obmol,compound.smiles) - @smarts.each_with_index do |smart,s| - smarts_pattern.init(smart) - if smarts_pattern.match(obmol) - count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 - else - value = 0 - end - @data_entries[c][s] = value - end - end - serialize - end - - # Count matches of an array with smarts features - def self.smarts_count compounds, smarts - # TODO: non-overlapping matches? 
- smarts_match compounds,smarts,true - end - - # Calculate physchem descriptors - # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset - def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS - parse compounds - @data_entries = Array.new(@compounds.size){[]} - @descriptors = descriptors - @smarts = nil - @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features - des = {} - @descriptors.each do |d| - lib, descriptor = d.split(".",2) - lib = lib.downcase.to_sym - des[lib] ||= [] - des[lib] << descriptor - end - des.each do |lib,descriptors| - p lib, descriptors - send(lib, descriptors) - end - serialize - end - - def self.openbabel descriptors - $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds" - obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format 'smi' - last_feature_idx = @physchem_descriptors.size - @compounds.each_with_index do |compound,c| - obconversion.read_string obmol, compound.smiles - obdescriptors.each_with_index do |descriptor,d| - @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) - end - end - @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"} - end - - def self.java_descriptors descriptors, lib - $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds" - sdf = sdf_3d - # use java system call (rjb blocks within tasks) - # use Tempfiles to avoid "Argument list too long" error - case lib - when "cdk" - run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" - when "joelib" - run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" - end - last_feature_idx = 
@physchem_descriptors.size - YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| - # TODO create warnings - #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty? - # CDK Descriptors may calculate multiple values, they are stored in separate features - @physchem_descriptors += calculation.keys if i == 0 - calculation.keys.each_with_index do |name,j| - @data_entries[i][j+last_feature_idx] = fix_value(calculation[name]) - end - end - FileUtils.rm "#{sdf}#{lib}.yaml" - end - - def self.cdk descriptors - java_descriptors descriptors, "cdk" - end - - def self.joelib descriptors - java_descriptors descriptors, "joelib" - end - - def self.lookup compounds, features, dataset - parse compounds - fingerprint = [] - compounds.each do |compound| - fingerprint << [] - features.each do |feature| - end - end - end - - def self.run_cmd cmd - cmd = "#{cmd} 2>&1" - $logger.debug "running external cmd: '#{cmd}'" - p = IO.popen(cmd) do |io| - while line = io.gets - $logger.debug "> #{line.chomp}" - end - io.close - raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0 - end - end - - def self.sdf_3d - # TODO check if 3d sdfs are stored in GridFS - sdf = "" - @compounds.each do |compound| - sdf << compound.sdf - end - sdf_file = "/tmp/#{SecureRandom.uuid}.sdf" - File.open(sdf_file,"w+"){|f| f.print sdf} - sdf_file - end - - def self.parse compounds - @input_class = compounds.class.to_s - case @input_class - when "OpenTox::Compound" - @compounds = [compounds] - when "Array" - @compounds = compounds - when "OpenTox::Dataset" - @compounds = compounds.compounds - else - bad_request_error "Cannot calculate descriptors for #{compounds.class} objects." 
- end - end - - def self.serialize - @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} - case @input_class - when "OpenTox::Compound" - @data_entries.first - when "Array" - @data_entries - when "OpenTox::Dataset" - dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) - if @smarts - dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} - @count ? algo = "count" : algo = "match" - dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" - - elsif @physchem_descriptors - dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} - dataset.data_entries = @data_entries - dataset.feature_calculation_algorithm = "#{self}.physchem" - #TODO params? - end - dataset.save_all - dataset - end - end - - def self.fix_value val - val = val.first if val.is_a? Array and val.size == 1 - val = nil if val == "NaN" - if val.numeric? - val = Float(val) - val = nil if val.nan? or val.infinite? 
- end - val - end - private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize - end - end -end diff --git a/lib/feature.rb b/lib/feature.rb index a308a55..b58946b 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -5,11 +5,11 @@ module OpenTox field :nominal, type: Boolean field :numeric, type: Boolean field :measured, type: Boolean + field :calculated, type: Boolean end # Feature for categorical variables class NominalFeature < Feature - # TODO check if accept_values are still needed field :accept_values, type: Array def initialize params super params @@ -34,21 +34,6 @@ module OpenTox end end - # Feature for supervised fragments from Fminer algorithm - class FminerSmarts < Smarts - field :p_value, type: Float - # TODO check if effect is used - field :effect, type: String - field :dataset_id - end - - # Feature for physico-chemical descriptors - class PhysChemDescriptor < NumericFeature - field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem" - field :parameters, type: Hash - field :creator, type: String - end - # Feature for categorical bioassay results class NominalBioAssay < NominalFeature end diff --git a/lib/lazar.rb b/lib/lazar.rb index cc66841..a28ba3a 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -8,6 +8,7 @@ require 'mongoid' require 'rserve' require "nokogiri" require "base64" +require 'openbabel' # Environment setup ENV["LAZAR_ENV"] ||= "production" @@ -24,7 +25,6 @@ Mongoid.load_configuration({ } }) Mongoid.raise_not_found_error = false # return nil if no document is found -#$mongo = Mongoid.default_client $mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}") $gridfs = $mongo.database.fs @@ -41,26 +41,27 @@ when "development" end # R setup +rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) +# should work on POSIX including os x +# http://stackoverflow.com/questions/19619582/number-of-processors-cores-in-command-line +NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i R = 
Rserve::Connection.new -R.eval "library(ggplot2)" -R.eval "library(grid)" -R.eval "library(gridExtra)" - -# Require sub-Repositories -require_relative '../libfminer/libbbrc/bbrc' # include before openbabel -require_relative '../libfminer/liblast/last' # -require_relative '../last-utils/lu.rb' -require_relative '../openbabel/lib/openbabel' - -# Fminer environment variables -ENV['FMINER_SMARTS'] = 'true' -ENV['FMINER_NO_AROMATIC'] = 'true' -ENV['FMINER_PVALUES'] = 'true' -ENV['FMINER_SILENT'] = 'true' -ENV['FMINER_NR_HITS'] = 'true' +R.eval " +suppressPackageStartupMessages({ + library(iterators,lib=\"#{rlib}\") + library(foreach,lib=\"#{rlib}\") + library(ggplot2,lib=\"#{rlib}\") + library(grid,lib=\"#{rlib}\") + library(gridExtra,lib=\"#{rlib}\") + library(pls,lib=\"#{rlib}\") + library(caret,lib=\"#{rlib}\") + library(doMC,lib=\"#{rlib}\") + registerDoMC(#{NR_CORES}) +}) +" # OpenTox classes and includes -CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", @@ -68,18 +69,16 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Repeat "error.rb", "opentox.rb", "feature.rb", + "physchem.rb", "compound.rb", "dataset.rb", - "descriptor.rb", "algorithm.rb", - "descriptor.rb", - "bbrc.rb", "model.rb", - "similarity.rb", "classification.rb", "regression.rb", "validation.rb", "crossvalidation.rb", + "leave-one-out-validation.rb", "experiment.rb", ].each{ |f| require_relative f } - +OpenTox::PhysChem.descriptors # load descriptor features diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb new file mode 100644 index 0000000..2cd13db --- /dev/null +++ 
b/lib/leave-one-out-validation.rb @@ -0,0 +1,205 @@ +module OpenTox + + class LeaveOneOutValidation + + field :model_id, type: BSON::ObjectId + field :dataset_id, type: BSON::ObjectId + field :nr_instances, type: Integer + field :nr_unpredicted, type: Integer + field :predictions, type: Array + field :finished_at, type: Time + + def self.create model + model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation + loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id + compound_ids = model.training_dataset.compound_ids + predictions = model.predict model.training_dataset.compounds + predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]} + predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?} + loo.nr_instances = predictions.size + predictions.select!{|p| p[:value]} # remove unpredicted + loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]} + loo.nr_unpredicted = loo.nr_instances - loo.predictions.size + loo.statistics + loo.save + loo + end + + def model + Model::Lazar.find model_id + end + end + + class ClassificationLeaveOneOutValidation < LeaveOneOutValidation + + field :accept_values, type: Array + field :confusion_matrix, type: Array, default: [] + field :weighted_confusion_matrix, type: Array, default: [] + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash, default: {} + field :predictivity, type: Hash, default: {} + field :confidence_plot_id, type: BSON::ObjectId + + def statistics + accept_values = Feature.find(model.prediction_feature_id).accept_values + confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} + weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} + predictions.each do |pred| + pred[:database_activities].each do |db_act| + if pred[:value] + if 
pred[:value] == db_act + if pred[:value] == accept_values[0] + confusion_matrix[0][0] += 1 + weighted_confusion_matrix[0][0] += pred[:confidence] + elsif pred[:value] == accept_values[1] + confusion_matrix[1][1] += 1 + weighted_confusion_matrix[1][1] += pred[:confidence] + end + else + if pred[:value] == accept_values[0] + confusion_matrix[0][1] += 1 + weighted_confusion_matrix[0][1] += pred[:confidence] + elsif pred[:value] == accept_values[1] + confusion_matrix[1][0] += 1 + weighted_confusion_matrix[1][0] += pred[:confidence] + end + end + end + end + end + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + update_attributes( + accept_values: accept_values, + confusion_matrix: confusion_matrix, + weighted_confusion_matrix: weighted_confusion_matrix, + accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, + weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, + true_rate: true_rate, + predictivity: predictivity, + finished_at: Time.now + ) + $logger.debug "Accuracy #{accuracy}" + end + + def confidence_plot + unless confidence_plot_id + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + accuracies = [] + confidences = [] + correct_predictions = 0 + incorrect_predictions = 0 + predictions.each do |p| + p[:database_activities].each do |db_act| + if p[:value] + p[:value] == db_act ? 
correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + confidences << p[:confidence] + + end + end + end + R.assign "accuracy", accuracies + R.assign "confidence", confidences + R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) + end + $gridfs.find_one(_id: confidence_plot_id).data + end + end + + + class RegressionLeaveOneOutValidation < LeaveOneOutValidation + + + field :rmse, type: Float, default: 0.0 + field :mae, type: Float, default: 0 + #field :weighted_rmse, type: Float, default: 0 + #field :weighted_mae, type: Float, default: 0 + field :r_squared, type: Float + field :correlation_plot_id, type: BSON::ObjectId + field :confidence_plot_id, type: BSON::ObjectId + + def statistics + confidence_sum = 0 + predicted_values = [] + measured_values = [] + predictions.each do |pred| + pred[:database_activities].each do |activity| + if pred[:value] + predicted_values << pred[:value] + measured_values << activity + error = Math.log10(pred[:value])-Math.log10(activity) + self.rmse += error**2 + #self.weighted_rmse += pred[:confidence]*error**2 + self.mae += error.abs + #self.weighted_mae += pred[:confidence]*error.abs + #confidence_sum += pred[:confidence] + end + end + if pred[:database_activities].empty? + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." 
+ end + end + R.assign "measurement", measured_values + R.assign "prediction", predicted_values + R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" + r = R.eval("r").to_ruby + + self.mae = self.mae/predictions.size + #self.weighted_mae = self.weighted_mae/confidence_sum + self.rmse = Math.sqrt(self.rmse/predictions.size) + #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum) + self.r_squared = r**2 + self.finished_at = Time.now + save + $logger.debug "R^2 #{r**2}" + $logger.debug "RMSE #{rmse}" + $logger.debug "MAE #{mae}" + end + + def correlation_plot + unless correlation_plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.svg" + predicted_values = [] + measured_values = [] + predictions.each do |pred| + pred[:database_activities].each do |activity| + if pred[:value] + predicted_values << pred[:value] + measured_values << activity + end + end + end + attributes = Model::Lazar.find(self.model_id).attributes + attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} + attributes = attributes.values.collect{|v| v.is_a?(String) ? 
v.sub(/OpenTox::/,'') : v}.join("\n") + R.assign "measurement", measured_values + R.assign "prediction", predicted_values + R.eval "all = c(-log(measurement),-log(prediction))" + R.eval "range = c(min(all), max(all))" + R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => plot_id) + end + $gridfs.find_one(_id: correlation_plot_id).data + end + end + +end diff --git a/lib/model.rb b/lib/model.rb index 227d4d3..8e657b8 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -34,7 +34,6 @@ module OpenTox def initialize training_dataset, params={} super params - bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 # TODO document convention prediction_feature = training_dataset.features.first @@ -48,13 +47,34 @@ module OpenTox self end - def predict object, use_database_values=true + def predict_compound compound + prediction_feature = Feature.find prediction_feature_id + neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) + # remove neighbors without prediction_feature + # check for database activities (neighbors may include query compound) + database_activities = nil + prediction = {} + if neighbors.collect{|n| n["_id"]}.include? compound.id + + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq + prediction[:database_activities] = database_activities + prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." 
+ neighbors.delete_if{|n| n["_id"] == compound.id} + end + neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] } + if neighbors.empty? + prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) + else + prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id})) + prediction[:neighbors] = neighbors + prediction[:neighbors] ||= [] + end + prediction + end - t = Time.now - at = Time.now + def predict object training_dataset = Dataset.find training_dataset_id - prediction_feature = Feature.find prediction_feature_id # parse data compounds = [] @@ -71,63 +91,33 @@ module OpenTox # make predictions predictions = [] - neighbors = [] - compounds.each_with_index do |compound,c| - t = Time.new - - neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) - # add activities - # TODO: improve efficiency, takes 3 times longer than previous version - neighbors.collect! do |n| - rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} - acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact - acts.empty? ? nil : n << acts - end - neighbors.compact! # remove neighbors without training activities - - database_activities = training_dataset.values(compound,prediction_feature) - if use_database_values and database_activities and !database_activities.empty? 
- database_activities = database_activities.first if database_activities.size == 1 - predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} - next - end - predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size}) -=begin -# TODO scaled dataset for physchem - p neighbor_algorithm_parameters - p (neighbor_algorithm_parameters["feature_dataset_id"]) - d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]) - p d - p d.class - if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset - p "SCALED" - end -=end - end + predictions = compounds.collect{|c| predict_compound c} # serialize result case object.class.to_s when "OpenTox::Compound" prediction = predictions.first - prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity + prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity return prediction when "Array" return predictions when "OpenTox::Dataset" # prepare prediction dataset + measurement_feature = Feature.find prediction_feature_id + + prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) prediction_dataset = LazarPrediction.new( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, :prediction_feature_id => prediction_feature.id ) - confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" ) - # TODO move into warnings field + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" ) warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") - prediction_dataset.features = [ 
prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ] prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]} - prediction_dataset.save_all + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]} + prediction_dataset.save return prediction_dataset end @@ -164,15 +154,12 @@ module OpenTox def self.create training_dataset, params={} model = self.new training_dataset, params model.neighbor_algorithm ||= "fingerprint_neighbors" - model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average" + model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.1 - #:type => "FP4", - #:training_dataset_id => training_dataset.id, - #:min_sim => 0.7 }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end @@ -181,31 +168,11 @@ module OpenTox end end - class LazarFminerClassification < LazarClassification - field :feature_calculation_parameters, type: Hash - - def self.create training_dataset, fminer_params={} - model = super(training_dataset) - model.update "_type" => self.to_s # adjust class - model = self.find model.id # adjust class - model.neighbor_algorithm = "fminer_neighbors" - model.neighbor_algorithm_parameters = { - :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", - :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id, - :min_sim => 0.3 - } - model.feature_calculation_parameters = fminer_params - model.save - model - end - end - class Prediction include OpenTox include Mongoid::Document include Mongoid::Timestamps - # TODO cv -> repeated cv # TODO field 
Validations field :endpoint, type: String field :species, type: String @@ -249,7 +216,6 @@ module OpenTox training_dataset = Dataset.from_csv_file file model = nil if training_dataset.features.first.nominal? - #model = LazarFminerClassification.create training_dataset model = LazarClassification.create training_dataset elsif training_dataset.features.first.numeric? model = LazarRegression.create training_dataset diff --git a/lib/overwrite.rb b/lib/overwrite.rb index c92ad2b..cef5758 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -22,6 +22,14 @@ class Numeric end end +class Float + # round to n significant digits + # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby + def signif(n) + Float("%.#{n}g" % self) + end +end + module Enumerable # @return [Array] only the duplicates of an enumerable def duplicates diff --git a/lib/physchem.rb b/lib/physchem.rb new file mode 100644 index 0000000..f7b880f --- /dev/null +++ b/lib/physchem.rb @@ -0,0 +1,133 @@ +module OpenTox + + # Feature for physico-chemical descriptors + class PhysChem < NumericFeature + + field :library, type: String + field :descriptor, type: String + field :description, type: String + + JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") + CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last + JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") + LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") + JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") + + obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] + OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + name,description = d.split(/\s+/,2) + ["Openbabel."+name,description] unless obexclude.include? 
name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + cdkdescriptors = {} + CDK_DESCRIPTIONS = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) + CDK_DESCRIPTIONS.each do |d| + prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'') + d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] } + end + CDKDESCRIPTORS = cdkdescriptors + + # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) + joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] + # strip Joelib messages from stdout + JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + name = d[:java_class].sub(/^joelib2.feature.types./,'') + ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + + require_relative "unique_descriptors.rb" + + def self.descriptors desc=DESCRIPTORS + desc.collect do |name,description| + lib,desc = name.split('.',2) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end + end + + def self.unique_descriptors + udesc = [] + UNIQUEDESCRIPTORS.each do |name| + lib,desc = name.split('.',2) + if lib == "Cdk" + CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n| + dname = "#{name}.#{n}" + description = DESCRIPTORS[dname] + udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end + else + description = DESCRIPTORS[name] + udesc << self.find_or_create_by(:name 
=> name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end + end + udesc + end + + def self.openbabel_descriptors + descriptors OBDESCRIPTORS + end + + def self.cdk_descriptors + descriptors CDKDESCRIPTORS + end + + def self.joelib_descriptors + descriptors JOELIBDESCRIPTORS + end + + def calculate compound + result = send library.downcase,descriptor,compound + result[self.name] + end + + def openbabel descriptor, compound + obdescriptor = OpenBabel::OBDescriptor.find_type descriptor + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format 'smi' + obconversion.read_string obmol, compound.smiles + {"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))} + end + + def cdk descriptor, compound + java_descriptor "cdk", descriptor, compound + end + + def joelib descriptor, compound + java_descriptor "joelib", descriptor, compound + end + + private + + def java_descriptor lib, descriptor, compound + + sdf_3d = "/tmp/#{SecureRandom.uuid}.sdf" + File.open(sdf_3d,"w+"){|f| f.print compound.sdf} + + # use java system call (rjb blocks within tasks) + # use Tempfiles to avoid "Argument list too long" error + case lib + when "cdk" + `java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf_3d} #{descriptor}` + when "joelib" + `java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf_3d} #{descriptor}` + end + result = YAML.load_file("#{sdf_3d}#{lib}.yaml").first + result.keys.each{|k| result[k] = result.delete(k)} + result + end + + def fix_value val + val = val.first if val.is_a? Array and val.size == 1 + val = nil if val == "NaN" + if val.numeric? + val = Float(val) + val = nil if val.nan? or val.infinite? 
+ end + val + end + + end + +end diff --git a/lib/regression.rb b/lib/regression.rb index 868c25f..5021fb3 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -1,262 +1,151 @@ -# TODO install R packages kernlab, caret, doMC, class, e1071 - - - # log transform activities (create new dataset) - # scale, normalize features, might not be necessary - # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is - # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression - # zero-order correlation and the semi-partial correlation - # seems to be necessary for svm - # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1 - # http://stackoverflow.com/questions/15436367/svm-scaling-input-values - # use lasso or elastic net?? - # select relevant features - # remove features with a single value - # remove correlated features - # remove features not correlated with endpoint module OpenTox module Algorithm class Regression - def self.weighted_average compound, params + def self.local_weighted_average compound, params weighted_sum = 0.0 sim_sum = 0.0 - confidence = 0.0 neighbors = params[:neighbors] - activities = [] neighbors.each do |row| - n,sim,acts = row - confidence = sim if sim > confidence # distance to nearest neighbor - # TODO add LOO errors - acts.each do |act| - weighted_sum += sim*Math.log10(act) - activities << act - sim_sum += sim + sim = row["tanimoto"] + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + weighted_sum += sim*Math.log10(act) + sim_sum += sim + end end end - #R.assign "activities", activities - #R.eval "cv = cv(activities)" - #confidence /= activities.standard_deviation#/activities.mean - #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size] - #confidence = 
sim_sum/neighbors.size.to_f - #confidence = neighbors.size.to_f - confidence = 0 if confidence.nan? sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) - {:value => prediction,:confidence => confidence} + {:value => prediction} end - def self.local_linear_regression compound, neighbors - p neighbors.size - return nil unless neighbors.size > 0 - features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq - p features - training_data = Array.new(neighbors.size){Array.new(features.size,0)} - neighbors.each_with_index do |n,i| - #p n.first - neighbor = Compound.find n.first - features.each_with_index do |f,j| - training_data[i][j] = 1 if neighbor.fp4.include? f + # TODO explicit neighbors, also for physchem + def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" + neighbors = params[:neighbors] + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + activities = [] + fingerprints = {} + weights = [] + fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + fingerprint = neighbor.fingerprint + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] + fingerprint_ids.each_with_index do |id,j| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) + end + end end end - p training_data - - R.assign "activities", neighbors.collect{|n| n[2].median} - R.assign "features", training_data - R.eval "model <- lm(activities ~ features)" - R.eval "summary <- summary(model)" - p R.summary - compound_features = features.collect{|f| compound.fp4.include? f ? 
1 : 0} - R.assign "compound_features", compound_features - R.eval "prediction <- predict(model,compound_features)" - p R.prediction - - end - def self.weighted_average_with_relevant_fingerprints neighbors - weighted_sum = 0.0 - sim_sum = 0.0 - fingerprint_features = [] - neighbors.each do |row| - n,sim,acts = row - neighbor = Compound.find n - fingerprint_features += neighbor.fp4 - end - fingerprint_features.uniq! - p fingerprint_features -=begin - p n - acts.each do |act| - weighted_sum += sim*Math.log10(act) - sim_sum += sim + variables = [] + data_frame = [activities] + fingerprints.each do |k,v| + unless v.uniq.size == 1 + data_frame << v.collect{|m| m ? "T" : "F"} + variables << k end end -=end - confidence = sim_sum/neighbors.size.to_f - sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) - {:value => prediction,:confidence => confidence} - end - - # Local support vector regression from neighbors - # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required - # @return [Numeric] A prediction value. - def self.local_svm_regression neighbors, params={:min_train_performance => 0.1} - confidence = 0.0 - prediction = nil + if variables.empty? + result = local_weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result - $logger.debug "Local SVM." - props = neighbors.collect{|row| row[3] } - neighbors.shift - activities = neighbors.collect{|n| n[2]} - prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting - prediction = nil if (!prediction.nil? && prediction.infinite?) - $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')." - if prediction - confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities}) else - confidence = nil if prediction.nil? 
+ compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} + prediction = r_model_prediction method, data_frame, variables, weights, compound_features + if prediction.nil? or prediction[:value].nil? + prediction = local_weighted_average(compound, params) + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return prediction + else + prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])] + prediction[:value] = 10**prediction[:value] + prediction[:rmse] = 10**prediction[:rmse] + prediction + end end - [prediction, confidence] - + end + def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4" - # Local support vector prediction from neighbors. - # Uses propositionalized setting. - # Not to be called directly (use local_svm_regression or local_svm_classification). - # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] - # @param [Array] activities, activities for neighbors. - # @param [Float] min_train_performance, parameter to control censoring - # @return [Numeric] A prediction value. - def self.local_svm_prop(props, activities, min_train_performance) - - $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)." - n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays. - q_prop = props[0] # is an Array. 
+ neighbors = params[:neighbors] + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 - prediction = nil - if activities.uniq.size == 1 - prediction = activities[0] - else - t = Time.now - #$logger.debug gram_matrix.to_yaml - #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests - @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests - rs = [] - ["caret", "doMC", "class"].each do |lib| - #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))" - rs << "suppressPackageStartupMessages(library('#{lib}'))" + activities = [] + weights = [] + physchem = {} + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + if row["features"][params[:prediction_feature_id].to_s] + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] # TODO cosine ? 
+ neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + physchem[pid] ||= [] + physchem[pid] << v + end + end end - #@r.eval "registerDoMC()" # switch on parallel processing - rs << "registerDoMC()" # switch on parallel processing - #@r.eval "set.seed(1)" - rs << "set.seed(1)" - $logger.debug "Loading R packages: #{Time.now-t}" - t = Time.now - p n_prop - begin - - # set data - rs << "n_prop <- c(#{n_prop.flatten.join(',')})" - rs << "n_prop <- c(#{n_prop.flatten.join(',')})" - rs << "n_prop_x_size <- c(#{n_prop.size})" - rs << "n_prop_y_size <- c(#{n_prop[0].size})" - rs << "y <- c(#{activities.join(',')})" - rs << "q_prop <- c(#{q_prop.join(',')})" - rs << "y = matrix(y)" - rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)" - rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)" - - $logger.debug "Setting R data: #{Time.now-t}" - t = Time.now - # prepare data - rs << " - weights=NULL - if (!(class(y) == 'numeric')) { - y = factor(y) - weights=unlist(as.list(prop.table(table(y)))) - weights=(weights-1)^2 - } - " - - rs << " - rem = nearZeroVar(prop_matrix) - if (length(rem) > 0) { - prop_matrix = prop_matrix[,-rem,drop=F] - q_prop = q_prop[,-rem,drop=F] - } - rem = findCorrelation(cor(prop_matrix)) - if (length(rem) > 0) { - prop_matrix = prop_matrix[,-rem,drop=F] - q_prop = q_prop[,-rem,drop=F] - } - " + end - #p @r.eval("y").to_ruby - #p "weights" - #p @r.eval("weights").to_ruby - $logger.debug "Preparing R data: #{Time.now-t}" - t = Time.now - # model + support vectors - #train_success = @r.eval <<-EOR - rs << ' - model = train(prop_matrix,y, - method="svmRadial", - preProcess=c("center", "scale"), - class.weights=weights, - trControl=trainControl(method="LGOCV",number=10), - tuneLength=8 - ) - perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) - ' - File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")} - p rs.join("\n") - p `Rscript 
/tmp/r.r` -=begin - @r.void_eval <<-EOR - model = train(prop_matrix,y, - method="svmRadial", - #preProcess=c("center", "scale"), - #class.weights=weights, - #trControl=trainControl(method="LGOCV",number=10), - #tuneLength=8 - ) - perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared ) - EOR -=end + # remove properties with a single value + physchem.each do |pid,v| + physchem.delete(pid) if v.uniq.size <= 1 + end - $logger.debug "Creating R SVM model: #{Time.now-t}" - t = Time.now - if train_success - # prediction - @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice - #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice - @r.eval "if (class(y)!='numeric') p = as.character(p)" - prediction = @r.p + if physchem.empty? + result = local_weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result - # censoring - prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f ) - prediction = nil if prediction =~ /NA/ - $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'" - else - $logger.debug "Model creation failed." - prediction = nil - end - $logger.debug "R Prediction: #{Time.now-t}" - rescue Exception => e - $logger.debug "#{e.class}: #{e.message}" - $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - ensure - #puts @r.inspect - #TODO: broken pipe - #@r.quit # free R + else + data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } + prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + if prediction.nil? + prediction = local_weighted_average(compound, params) + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." 
+ return prediction + else + prediction[:value] = 10**prediction[:value] + prediction end end - prediction + end - end + def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values + R.assign "weights", training_weights + r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", training_features + R.eval "names(data) <- append(c('activities'),features)" # + begin + R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" + rescue + return nil + end + R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + R.eval "names(fingerprint) <- features" + R.eval "prediction <- predict(model,fingerprint)" + { + :value => R.eval("prediction").to_f, + :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, + :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + } + end + + end end end diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb index de1b74f..9321a75 100644 --- a/lib/rest-client-wrapper.rb +++ b/lib/rest-client-wrapper.rb @@ -26,15 +26,14 @@ module OpenTox define_singleton_method method do |uri,payload={},headers={},waiting_task=nil| # check input - bad_request_error "Headers are not a hash: #{headers.inspect}", uri unless headers==nil or headers.is_a?(Hash) + bad_request_error "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash) headers[:subjectid] ||= @@subjectid - bad_request_error "Invalid URI: '#{uri}'", uri unless URI.valid? uri - #resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri) + bad_request_error "Invalid URI: '#{uri}'" unless URI.valid? uri # make sure that no header parameters are set in the payload [:accept,:content_type,:subjectid].each do |header| if defined? 
$aa || URI(uri).host == URI($aa[:uri]).host else - bad_request_error "#{header} should be submitted in the headers", uri if payload and payload.is_a?(Hash) and payload[header] + bad_request_error "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header] end end @@ -72,7 +71,7 @@ module OpenTox msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}" cause = nil end - Object.method(error[:method]).call msg, uri, cause # call error method + Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method else response end diff --git a/lib/similarity.rb b/lib/similarity.rb deleted file mode 100644 index 91e18db..0000000 --- a/lib/similarity.rb +++ /dev/null @@ -1,58 +0,0 @@ -=begin -* Name: similarity.rb -* Description: Similarity algorithms -* Author: Andreas Maunz <andreas@maunz.de -* Date: 10/2012 -=end - -module OpenTox - module Algorithm - - class Similarity - - #TODO weighted tanimoto - - # Tanimoto similarity - # @param [Array] a fingerprints of first compound - # @param [Array] b fingerprints of second compound - # @return [Float] Tanimoto similarity - def self.tanimoto(a,b) - bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size - #common = 0.0 - #a.each_with_index do |n,i| - #common += 1 if n == b[i] - #end - #common/a.size - # TODO check if calculation speed can be improved - common_p_sum = 0.0 - all_p_sum = 0.0 - (0...a.size).each { |idx| - common_p_sum += [ a[idx], b[idx] ].min - all_p_sum += [ a[idx], b[idx] ].max - } - common_p_sum/all_p_sum - end - - - # Cosine similarity - # @param [Array] a fingerprints of first compound - # @param [Array] b fingerprints of second compound - # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b - def self.cosine(a, b) - val = 0.0 - if a.size>0 and b.size>0 - if a.size>12 && b.size>12 - a = a[0..11] - b = b[0..11] - end - a_vec = 
a.to_gv - b_vec = b.to_gv - val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm) - end - val - end - - end - - end -end diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb index cf9cbf3..8341a67 100644 --- a/lib/unique_descriptors.rb +++ b/lib/unique_descriptors.rb @@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [ "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib) "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib) "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib) - #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! + #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! "Openbabel.logP", #octanol/water partition coefficient "Openbabel.MP", #Melting point "Openbabel.MR", #molar refractivity @@ -24,7 +24,7 @@ UNIQUEDESCRIPTORS = [ "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens). "Cdk.AcidicGroupCount", #Returns the number of acidic groups. - "Cdk.AminoAcidCount", #Returns the number of amino acids found in the system + #"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule. #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule. #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type. 
@@ -75,7 +75,7 @@ UNIQUEDESCRIPTORS = [ "Joelib.count.NumberOfP", #no description available "Joelib.count.NumberOfO", #no description available "Joelib.count.NumberOfN", #no description available - #"Joelib.count.AromaticBonds", #no description available + #"Joelib.count.AromaticBonds", #no description available "Joelib.count.NumberOfI", #no description available "Joelib.count.NumberOfF", #no description available "Joelib.count.NumberOfC", #no description available @@ -91,7 +91,7 @@ UNIQUEDESCRIPTORS = [ "Joelib.GeometricalShapeCoefficient", #no description available #"Joelib.MolecularWeight", #no description available "Joelib.FractionRotatableBonds", #no description available - #"Joelib.count.HBD2", #no description available + #"Joelib.count.HBD2", #no description available #"Joelib.count.HBD1", #no description available "Joelib.LogP", #no description available "Joelib.GraphShapeCoefficient", #no description available @@ -116,5 +116,4 @@ UNIQUEDESCRIPTORS = [ "Joelib.count.SOGroups", #no description available "Joelib.TopologicalDiameter", #no description available "Joelib.count.NumberOfHal", #no description available - -].sort +] diff --git a/lib/validation.rb b/lib/validation.rb index c52ffc0..b72d273 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -29,17 +29,21 @@ module OpenTox atts[:training_dataset_id] = training_set.id validation_model = model.class.create training_set, atts validation_model.save - test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used + cids = test_set.compound_ids + + test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used prediction_dataset = validation_model.predict test_set_without_activities predictions = [] nr_unpredicted = 0 activities = test_set.data_entries.collect{|de| de.first} prediction_dataset.data_entries.each_with_index do |de,i| - if de[0] and de[1] and 
de[1].numeric? - activity = activities[i] + if de[0] #and de[1] + cid = prediction_dataset.compound_ids[i] + rows = cids.each_index.select{|r| cids[r] == cid } + activities = rows.collect{|r| test_set.data_entries[r][0]} prediction = de.first confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]] + predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]] else nr_unpredicted += 1 end @@ -63,6 +67,42 @@ module OpenTox end class RegressionValidation < Validation + + def statistics + rmse = 0 + weighted_rmse = 0 + rse = 0 + weighted_rse = 0 + mae = 0 + weighted_mae = 0 + confidence_sum = 0 + predictions.each do |pred| + compound_id,activity,prediction,confidence = pred + if activity and prediction + error = Math.log10(prediction)-Math.log10(activity.median) + rmse += error**2 + weighted_rmse += confidence*error**2 + mae += error.abs + weighted_mae += confidence*error.abs + confidence_sum += confidence + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end + end + x = predictions.collect{|p| p[1].median} + y = predictions.collect{|p| p[2]} + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + weighted_mae = weighted_mae/confidence_sum + rmse = Math.sqrt(rmse/predictions.size) + weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) + { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } + end end end |