summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: Christoph Helma <helma@in-silico.ch> 2015-08-13 18:57:11 +0200
committer: Christoph Helma <helma@in-silico.ch> 2015-08-13 18:57:11 +0200
commit: d0850e2983a219da214a67190fe881c7650f532f (patch)
tree: a917334a1a70823dc979a27e453b2598e98c8027 /lib
parent: 6ab86c253ba0eb79b9e6a20effa2d18626accf2b (diff)
majority of tests working
Diffstat (limited to 'lib')
-rw-r--r-- lib/bbrc.rb | 12
-rw-r--r-- lib/classification.rb | 8
-rw-r--r-- lib/compound.rb | 14
-rw-r--r-- lib/dataset.rb | 23
-rw-r--r-- lib/descriptor.rb | 7
-rw-r--r-- lib/lazar-model.rb | 21
-rw-r--r-- lib/overwrite.rb | 8
-rw-r--r-- lib/regression.rb | 2
8 files changed, 64 insertions, 31 deletions
diff --git a/lib/bbrc.rb b/lib/bbrc.rb
index 6a2eed7..c83b9b3 100644
--- a/lib/bbrc.rb
+++ b/lib/bbrc.rb
@@ -26,6 +26,7 @@ module OpenTox
minfreq = params[:min_frequency]
else
per_mil = 5 # value from latest version
+ per_mil = 8 # as suggested below
i = training_dataset.feature_ids.index prediction_feature.id
nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
@@ -65,9 +66,11 @@ module OpenTox
# add data
training_dataset.compounds.each_with_index do |compound,i|
- @bbrc.AddCompound(compound.smiles,i+1)
act = value2act[training_dataset.data_entries[i].first]
- @bbrc.AddActivity(act,i+1)
+ if act # TODO check if this works
+ @bbrc.AddCompound(compound.smiles,i+1)
+ @bbrc.AddActivity(act,i+1)
+ end
end
#g_median=@fminer.all_activities.values.to_scale.median
@@ -94,6 +97,9 @@ module OpenTox
end
p_value = f.shift
f.flatten!
+ compound_idxs = f.collect{|e| e.first.first-1}
+ # majority class
+ effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode
=begin
if (!@bbrc.GetRegression)
@@ -122,7 +128,7 @@ module OpenTox
feature = OpenTox::FminerSmarts.find_or_create_by({
"smarts" => smarts,
"p_value" => p_value.to_f.abs.round(5),
- #"effect" => effect,
+ "effect" => effect,
"dataset_id" => feature_dataset.id
})
feature_dataset.feature_ids << feature.id
diff --git a/lib/classification.rb b/lib/classification.rb
index fc6fa77..723c66f 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -4,7 +4,7 @@ module OpenTox
class Classification
def self.weighted_majority_vote neighbors
- return [nil,nil] if neighbors.empty?
+ return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?
weighted_sum = {}
sim_sum = 0.0
neighbors.each do |row|
@@ -16,13 +16,13 @@ module OpenTox
end
case weighted_sum.size
when 1
- return [weighted_sum.keys.first, 1.0]
+ return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs}
when 2
sim_sum = weighted_sum[weighted_sum.keys[0]]
sim_sum -= weighted_sum[weighted_sum.keys[1]]
sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
confidence = (sim_sum/neighbors.size).abs
- return [prediction,confidence]
+ return {:value => prediction,:confidence => confidence}
else
bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
end
@@ -94,7 +94,7 @@ module OpenTox
#$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')."
confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
end
- {:prediction => prediction, :confidence => confidence}
+ {:value => prediction, :confidence => confidence}
end
diff --git a/lib/compound.rb b/lib/compound.rb
index 5343aa0..10deabc 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -13,6 +13,7 @@ module OpenTox
field :smiles, type: String
field :inchikey, type: String
field :names, type: Array
+ field :warning, type: String
field :cid, type: String
field :chemblid, type: String
field :png_id, type: BSON::ObjectId
@@ -46,7 +47,12 @@ module OpenTox
# @return [OpenTox::Compound] Compound
def self.from_smiles smiles
# do not store smiles because it might be noncanonical
- Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can")
+ smiles = obconversion(smiles,"smi","can")
+ if smiles.empty?
+ Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
+ else
+ Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can")
+ end
end
# Create a compound from inchi string
@@ -57,7 +63,11 @@ module OpenTox
# http://sourceforge.net/p/openbabel/bugs/957/
# bug has not been fixed in latest git/development version
smiles = `echo "#{inchi}" | babel -iinchi - -ocan`.chomp.strip
- smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
+ if smiles.empty?
+ Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
+ else
+ Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
+ end
end
# Create a compound from sdf string
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 4f6f0b5..8c5ffc0 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -208,30 +208,29 @@ module OpenTox
value_time = 0
# compounds and values
- @data_entries = Array.new(table.size){Array.new(table.first.size-1)}
+ @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)}
table.each_with_index do |vals,i|
ct = Time.now
identifier = vals.shift
warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
begin
- # TODO parse inchi and catch openbabel errors (and segfaults) in compound.rb
case compound_format
when /SMILES/i
compound = OpenTox::Compound.from_smiles(identifier)
- if compound.inchi.empty?
- warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
- next
- end
when /InChI/i
compound = OpenTox::Compound.from_inchi(identifier)
end
- rescue
+ rescue
+ compound = nil
+ end
+ if compound.nil?
+ # compound parsers may return nil
warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
next
end
+ # TODO insert empty compounds to keep positions?
compound_time += Time.now-ct
- compound_ids << compound.id
r += 1
unless vals.size == feature_ids.size # way cheaper than accessing features
@@ -239,15 +238,17 @@ module OpenTox
next
end
- cid = compound.id.to_s
+ compound_ids << compound.id
+ @data_entries << Array.new(table.first.size-1)
+
vals.each_with_index do |v,j|
if v.blank?
warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
next
elsif numeric[j]
- @data_entries[i][j] = v.to_f
+ @data_entries.last[j] = v.to_f
else
- @data_entries[i][j] = v.strip
+ @data_entries.last[j] = v.strip
end
end
end
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index f0492a2..5ae0ef2 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -71,12 +71,6 @@ module OpenTox
@physchem_descriptors = nil
@data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
@compounds.each_with_index do |compound,c|
- # TODO OpenBabel may segfault here
- # catch inchi errors in compound.rb
- # eg. at line 249 of rat_feature_dataset
- # which worked with opentox-client
- # (but no smarts_match)
- #p "'#{compound.inchi}'"
obconversion.read_string(obmol,compound.smiles)
@smarts.each_with_index do |smart,s|
smarts_pattern.init(smart)
@@ -214,6 +208,7 @@ module OpenTox
end
def self.serialize
+ @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
case @input_class
when "OpenTox::Compound"
@data_entries.first
diff --git a/lib/lazar-model.rb b/lib/lazar-model.rb
index 4ca3403..aeaa515 100644
--- a/lib/lazar-model.rb
+++ b/lib/lazar-model.rb
@@ -9,7 +9,6 @@ module OpenTox
store_in collection: "models"
field :title, type: String
- field :endpoint, type: String
field :creator, type: String, default: __FILE__
# datasets
field :training_dataset_id, type: BSON::ObjectId
@@ -64,12 +63,18 @@ module OpenTox
# make predictions
predictions = []
+ neighbors = []
compounds.each_with_index do |compound,c|
t = Time.new
+ database_activities = training_dataset.values(compound,prediction_feature)
+ if database_activities and !database_activities.empty?
+ database_activities = database_activities.first if database_activities.size == 1
+ predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
+ next
+ end
neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
# add activities
# TODO: improve efficiency, takes 3 times longer than previous version
- # TODO database activity??
neighbors.collect! do |n|
rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
@@ -82,7 +87,9 @@ module OpenTox
# serialize result
case object.class.to_s
when "OpenTox::Compound"
- return predictions.first
+ prediction = predictions.first
+ prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity
+ return prediction
when "Array"
return predictions
when "OpenTox::Dataset"
@@ -98,7 +105,7 @@ module OpenTox
warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
prediction_dataset.compounds = compounds
- prediction_dataset.data_entries = predictions
+ prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
prediction_dataset.save_all
return prediction_dataset
end
@@ -281,6 +288,12 @@ module OpenTox
end
+ class PredictionModel < Lazar
+ field :category, type: String
+ field :endpoint, type: String
+ field :crossvalidation_id, type: BSON::ObjectId
+ end
+
end
end
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index a27d685..df515eb 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -22,6 +22,14 @@ module Enumerable
def duplicates
inject({}) {|h,v| h[v]=h[v].to_i+1; h}.reject{|k,v| v==1}.keys
end
+ # http://stackoverflow.com/questions/2562256/find-most-common-string-in-an-array
+ Enumerable.class_eval do
+ def mode
+ group_by do |e|
+ e
+ end.values.max_by(&:size).first
+ end
+ end
end
class String
diff --git a/lib/regression.rb b/lib/regression.rb
index 891d7f9..8a52e7d 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -31,7 +31,7 @@ module OpenTox
end
confidence = sim_sum/neighbors.size.to_f
sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
- [prediction,confidence]
+ {:value => prediction,:confidence => confidence}
end
# Local support vector regression from neighbors