From d0850e2983a219da214a67190fe881c7650f532f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Aug 2015 18:57:11 +0200 Subject: majority of tests working --- lib/bbrc.rb | 12 +++++++++--- lib/classification.rb | 8 ++++---- lib/compound.rb | 14 ++++++++++++-- lib/dataset.rb | 23 ++++++++++++----------- lib/descriptor.rb | 7 +------ lib/lazar-model.rb | 21 +++++++++++++++++---- lib/overwrite.rb | 8 ++++++++ lib/regression.rb | 2 +- 8 files changed, 64 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/lib/bbrc.rb b/lib/bbrc.rb index 6a2eed7..c83b9b3 100644 --- a/lib/bbrc.rb +++ b/lib/bbrc.rb @@ -26,6 +26,7 @@ module OpenTox minfreq = params[:min_frequency] else per_mil = 5 # value from latest version + per_mil = 8 # as suggested below i = training_dataset.feature_ids.index prediction_feature.id nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST @@ -65,9 +66,11 @@ module OpenTox # add data training_dataset.compounds.each_with_index do |compound,i| - @bbrc.AddCompound(compound.smiles,i+1) act = value2act[training_dataset.data_entries[i].first] - @bbrc.AddActivity(act,i+1) + if act # TODO check if this works + @bbrc.AddCompound(compound.smiles,i+1) + @bbrc.AddActivity(act,i+1) + end end #g_median=@fminer.all_activities.values.to_scale.median @@ -94,6 +97,9 @@ module OpenTox end p_value = f.shift f.flatten! + compound_idxs = f.collect{|e| e.first.first-1} + # majority class + effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode =begin if (!@bbrc.GetRegression) @@ -122,7 +128,7 @@ module OpenTox feature = OpenTox::FminerSmarts.find_or_create_by({ "smarts" => smarts, "p_value" => p_value.to_f.abs.round(5), - #"effect" => effect, + "effect" => effect, "dataset_id" => feature_dataset.id }) feature_dataset.feature_ids << feature.id diff --git a/lib/classification.rb b/lib/classification.rb index fc6fa77..723c66f 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -4,7 +4,7 @@ module OpenTox class Classification def self.weighted_majority_vote neighbors - return [nil,nil] if neighbors.empty? + return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty? weighted_sum = {} sim_sum = 0.0 neighbors.each do |row| @@ -16,13 +16,13 @@ module OpenTox end case weighted_sum.size when 1 - return [weighted_sum.keys.first, 1.0] + return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs} when 2 sim_sum = weighted_sum[weighted_sum.keys[0]] sim_sum -= weighted_sum[weighted_sum.keys[1]] sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1] confidence = (sim_sum/neighbors.size).abs - return [prediction,confidence] + return {:value => prediction,:confidence => confidence} else bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'" end @@ -94,7 +94,7 @@ module OpenTox #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) end - {:prediction => prediction, :confidence => confidence} + {:value => prediction, :confidence => confidence} end diff --git a/lib/compound.rb b/lib/compound.rb index 5343aa0..10deabc 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -13,6 +13,7 @@ module OpenTox field :smiles, type: String field :inchikey, type: String field :names, type: Array + field :warning, type: String field :cid, type: String field :chemblid, type: String field :png_id, type: BSON::ObjectId @@ -46,7 +47,12 @@ module OpenTox # @return [OpenTox::Compound] Compound def self.from_smiles smiles # do not store smiles because it might be noncanonical - Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") + smiles = obconversion(smiles,"smi","can") + if smiles.empty? + Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.") + else + Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") + end end # Create a compound from inchi string @@ -57,7 +63,11 @@ module OpenTox # http://sourceforge.net/p/openbabel/bugs/957/ # bug has not been fixed in latest git/development version smiles = `echo "#{inchi}" | babel -iinchi - -ocan`.chomp.strip - smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) + if smiles.empty? + Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.") + else + Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) + end end # Create a compound from sdf string diff --git a/lib/dataset.rb b/lib/dataset.rb index 4f6f0b5..8c5ffc0 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -208,30 +208,29 @@ module OpenTox value_time = 0 # compounds and values - @data_entries = Array.new(table.size){Array.new(table.first.size-1)} + @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)} table.each_with_index do |vals,i| ct = Time.now identifier = vals.shift warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty? begin - # TODO parse inchi and catch openbabel errors (and segfaults) in compound.rb case compound_format when /SMILES/i compound = OpenTox::Compound.from_smiles(identifier) - if compound.inchi.empty? - warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." - next - end when /InChI/i compound = OpenTox::Compound.from_inchi(identifier) end - rescue + rescue + compound = nil + end + if compound.nil? + # compound parsers may return nil warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end + # TODO insert empty compounds to keep positions? compound_time += Time.now-ct - compound_ids << compound.id r += 1 unless vals.size == feature_ids.size # way cheaper than accessing features @@ -239,15 +238,17 @@ module OpenTox next end - cid = compound.id.to_s + compound_ids << compound.id + @data_entries << Array.new(table.first.size-1) + vals.each_with_index do |v,j| if v.blank? warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." next elsif numeric[j] - @data_entries[i][j] = v.to_f + @data_entries.last[j] = v.to_f else - @data_entries[i][j] = v.strip + @data_entries.last[j] = v.strip end end end diff --git a/lib/descriptor.rb b/lib/descriptor.rb index f0492a2..5ae0ef2 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -71,12 +71,6 @@ module OpenTox @physchem_descriptors = nil @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} @compounds.each_with_index do |compound,c| - # TODO OpenBabel may segfault here - # catch inchi errors in compound.rb - # eg. at line 249 of rat_feature_dataset - # which worked with opentox-client - # (but no smarts_match) - #p "'#{compound.inchi}'" obconversion.read_string(obmol,compound.smiles) @smarts.each_with_index do |smart,s| smarts_pattern.init(smart) @@ -214,6 +208,7 @@ module OpenTox end def self.serialize + @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} case @input_class when "OpenTox::Compound" @data_entries.first diff --git a/lib/lazar-model.rb b/lib/lazar-model.rb index 4ca3403..aeaa515 100644 --- a/lib/lazar-model.rb +++ b/lib/lazar-model.rb @@ -9,7 +9,6 @@ module OpenTox store_in collection: "models" field :title, type: String - field :endpoint, type: String field :creator, type: String, default: __FILE__ # datasets field :training_dataset_id, type: BSON::ObjectId @@ -64,12 +63,18 @@ module OpenTox # make predictions predictions = [] + neighbors = [] compounds.each_with_index do |compound,c| t = Time.new + database_activities = training_dataset.values(compound,prediction_feature) + if database_activities and !database_activities.empty? + database_activities = database_activities.first if database_activities.size == 1 + predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} + next + end neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) # add activities # TODO: improve efficiency, takes 3 times longer than previous version - # TODO database activity?? neighbors.collect! do |n| rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact @@ -82,7 +87,9 @@ module OpenTox # serialize result case object.class.to_s when "OpenTox::Compound" - return predictions.first + prediction = predictions.first + prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity + return prediction when "Array" return predictions when "OpenTox::Dataset" @@ -98,7 +105,7 @@ module OpenTox warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings") prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]} prediction_dataset.save_all return prediction_dataset end @@ -281,6 +288,12 @@ module OpenTox end + class PredictionModel < Lazar + field :category, type: String + field :endpoint, type: String + field :crossvalidation_id, type: BSON::ObjectId + end + end end diff --git a/lib/overwrite.rb b/lib/overwrite.rb index a27d685..df515eb 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -22,6 +22,14 @@ module Enumerable def duplicates inject({}) {|h,v| h[v]=h[v].to_i+1; h}.reject{|k,v| v==1}.keys end + # http://stackoverflow.com/questions/2562256/find-most-common-string-in-an-array + Enumerable.class_eval do + def mode + group_by do |e| + e + end.values.max_by(&:size).first + end + end end class String diff --git a/lib/regression.rb b/lib/regression.rb index 891d7f9..8a52e7d 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -31,7 +31,7 @@ module OpenTox end confidence = sim_sum/neighbors.size.to_f sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) - [prediction,confidence] + {:value => prediction,:confidence => confidence} end # Local support vector regression from neighbors -- cgit v1.2.3