author    Christoph Helma <helma@in-silico.ch>    2016-05-12 15:23:01 +0200
committer Christoph Helma <helma@in-silico.ch>    2016-05-12 15:23:01 +0200
commit    b8bb12c8a163c238d7d4387c1914e2100bb660df (patch)
tree      791d1524e2294d8a3a38658607a644d7576784ae /lib
parent    937bfbaf058aea5973927cb3bf6b51028b312ed9 (diff)
enm study import fixed
Diffstat (limited to 'lib')
-rw-r--r--  lib/classification.rb    15
-rw-r--r--  lib/compound.rb         120
-rw-r--r--  lib/crossvalidation.rb   21
-rw-r--r--  lib/dataset.rb           77
-rw-r--r--  lib/import.rb             8
-rw-r--r--  lib/lazar.rb              2
-rw-r--r--  lib/model.rb             65
-rw-r--r--  lib/nanoparticle.rb      80
-rw-r--r--  lib/regression.rb       102
-rw-r--r--  lib/substance.rb          1
-rw-r--r--  lib/validation.rb         4
11 files changed, 270 insertions, 225 deletions
diff --git a/lib/classification.rb b/lib/classification.rb
index 4cc9201..48ff8b3 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -3,17 +3,15 @@ module OpenTox
class Classification
- def self.weighted_majority_vote compound, params
- neighbors = params[:neighbors]
- feature_id = params[:prediction_feature_id].to_s
- dataset_id = params[:training_dataset_id].to_s
+ def self.weighted_majority_vote substance, neighbors
sims = {}
- neighbors.each do |n|
- sim = n["tanimoto"]
- n["toxicities"][feature_id][dataset_id].each do |act|
+ neighbors.each do |neighbor|
+ sim = neighbor["similarity"]
+ activities = neighbor["toxicities"]
+ activities.each do |act|
sims[act] ||= []
sims[act] << sim
- end if n["toxicities"][feature_id][dataset_id]
+ end if activities
end
sim_all = sims.collect{|a,s| s}.flatten
sim_sum = sim_all.sum
@@ -26,7 +24,6 @@ module OpenTox
p_max = probabilities.collect{|a,p| p}.max
prediction = probabilities.key(p_max)
{:value => prediction,:probabilities => probabilities}
-
end
end
end
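
The refactored weighted_majority_vote receives the neighbor list directly instead of digging feature and dataset ids out of a params hash; each neighbor now carries its measurements under "toxicities" and its similarity under "similarity". A minimal stand-alone sketch of the vote, using hypothetical neighbor data shaped like the hashes built by Compound#fingerprint_neighbors below:

    neighbors = [
      {"similarity" => 0.9, "toxicities" => ["active"]},
      {"similarity" => 0.4, "toxicities" => ["inactive", "active"]},
    ]
    sims = {}
    neighbors.each do |neighbor|
      activities = neighbor["toxicities"]
      activities.each { |act| (sims[act] ||= []) << neighbor["similarity"] } if activities
    end
    sim_sum = sims.values.flatten.sum                  # 0.9 + 0.4 + 0.4 = 1.7
    probabilities = sims.map { |act, s| [act, s.sum / sim_sum] }.to_h
    # => {"active" => 0.76, "inactive" => 0.24} (rounded)
    prediction = probabilities.key(probabilities.values.max)  # => "active"
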
diff --git a/lib/compound.rb b/lib/compound.rb
index 0a9111b..2554d54 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -254,67 +254,69 @@ module OpenTox
self["chemblid"]
end
- def fingerprint_count_neighbors params
- # TODO fix
+# def fingerprint_count_neighbors params
+# # TODO fix
+# neighbors = []
+# query_fingerprint = self.fingerprint params[:type]
+# training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
+# unless self == compound
+# candidate_fingerprint = compound.fingerprint params[:type]
+# features = (query_fingerprint + candidate_fingerprint).uniq
+# min_sum = 0
+# max_sum = 0
+# features.each do |f|
+# min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
+# min_sum += min
+# max_sum += max
+# end
+# max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
+# neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
+# end
+# end
+# neighbors.sort{|a,b| b.last <=> a.last}
+# end
+
+ def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:)
neighbors = []
- query_fingerprint = self.fingerprint params[:type]
- training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
- unless self == compound
- candidate_fingerprint = compound.fingerprint params[:type]
- features = (query_fingerprint + candidate_fingerprint).uniq
- min_sum = 0
- max_sum = 0
- features.each do |f|
- min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
- min_sum += min
- max_sum += max
- end
- max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
- neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
+ dataset = Dataset.find(dataset_id)
+ if type == DEFAULT_FINGERPRINT
+ neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id)
+ neighbors.each do |n|
+ n["toxicities"] = dataset.values(n["_id"],prediction_feature_id)
end
- end
- neighbors.sort{|a,b| b.last <=> a.last}
- end
-
- def fingerprint_neighbors params
- bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
- neighbors = []
- if params[:type] == DEFAULT_FINGERPRINT
- neighbors = db_neighbors params
else
- query_fingerprint = self.fingerprint params[:type]
- training_dataset = Dataset.find(params[:training_dataset_id])
- prediction_feature = training_dataset.features.first
- training_dataset.compounds.each do |compound|
- candidate_fingerprint = compound.fingerprint params[:type]
- sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
- fid = prediction_feature.id.to_s
- did = params[:training_dataset_id].to_s
- v = compound.toxicities[prediction_feature.id.to_s]
- neighbors << {"_id" => compound.id, "toxicities" => {fid => {did => v[params[:training_dataset_id].to_s]}}, "tanimoto" => sim} if sim >= params[:min_sim] and v
- end
- neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
- end
- neighbors
- end
-
- def physchem_neighbors params
- feature_dataset = Dataset.find params[:feature_dataset_id]
- query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
- neighbors = []
- feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
- # TODO implement pearson and cosine similarity separatly
- R.assign "x", query_fingerprint
- R.assign "y", candidate_fingerprint
- sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
- if sim >= params[:min_sim]
- neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ query_fingerprint = self.fingerprint type
+ dataset.compounds.each do |compound|
+ values = dataset.values(compound,prediction_feature_id)
+ if values
+ candidate_fingerprint = compound.fingerprint type
+ sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint)
+ neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim
+ end
end
+ neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]}
end
neighbors
end
- def db_neighbors params
+# def physchem_neighbors params
+# # TODO: fix, tests
+# feature_dataset = Dataset.find params[:feature_dataset_id]
+# query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
+# neighbors = []
+# feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
+# # TODO implement pearson and cosine similarity separatly
+# R.assign "x", query_fingerprint
+# R.assign "y", candidate_fingerprint
+# sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
+# if sim >= params[:min_sim]
+# neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+# end
+# end
+# neighbors
+# end
+
+ def db_neighbors min_sim: 0.1, dataset_id:
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
#qn = default_fingerprint_size
@@ -326,20 +328,20 @@ module OpenTox
#{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
#{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
{'$project' => {
- 'tanimoto' => {'$let' => {
+ 'similarity' => {'$let' => {
'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
- #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
}},
'_id' => 1,
- 'toxicities' => 1,
+ #'toxicities' => 1,
'dataset_ids' => 1
}},
- {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
- {'$sort' => {'tanimoto' => -1}}
+ {'$match' => {'similarity' => {'$gte' => min_sim}}},
+ {'$sort' => {'similarity' => -1}}
]
- $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
+ # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array
+ $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id}
end
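
fingerprint_neighbors and db_neighbors now take required keyword arguments in place of a loosely validated params hash, and the similarity key is renamed from "tanimoto" to "similarity" throughout. A sketch of the new call site, assuming persisted training_dataset and prediction_feature records:

    # "MP2D" is assumed to equal DEFAULT_FINGERPRINT here; that type is answered
    # by the MongoDB aggregation in db_neighbors, while other fingerprint types
    # fall back to pairwise Tanimoto over the dataset's compounds.
    neighbors = compound.fingerprint_neighbors(
      type: "MP2D",
      min_sim: 0.1,
      dataset_id: training_dataset.id,
      prediction_feature_id: prediction_feature.id
    )
    neighbors.first["similarity"]   # highest Tanimoto; the list is sorted descending
    neighbors.first["toxicities"]   # measured values looked up via Dataset#values
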
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 8e0c5b9..da4b731 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -77,6 +77,7 @@ module OpenTox
def statistics
stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values)
update_attributes(stat)
+ stat
end
def confidence_plot
@@ -120,6 +121,7 @@ module OpenTox
def statistics
stat = ValidationStatistics.regression predictions
update_attributes(stat)
+ stat
end
def misclassifications n=nil
@@ -164,24 +166,29 @@ module OpenTox
end
def correlation_plot
- unless correlation_plot_id
+ #unless correlation_plot_id
tmpfile = "/tmp/#{id.to_s}_correlation.png"
- x = predictions.collect{|p| p[1]}
- y = predictions.collect{|p| p[2]}
+ x = []
+ y = []
+ predictions.each do |sid,p|
+ x << p["value"]
+ y << p["measured"].median
+ end
attributes = Model::Lazar.find(self.model_id).attributes
attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
R.assign "measurement", x
R.assign "prediction", y
- R.eval "all = c(-log(measurement),-log(prediction))"
+ R.eval "all = c(measurement,prediction)"
R.eval "range = c(min(all), max(all))"
- R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
+ R.eval "image = qplot(prediction,measurement,main='#{self.name}',asp=1,xlim=range, ylim=range)"
R.eval "image = image + geom_abline(intercept=0, slope=1)"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ #R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ R.eval "ggsave(file='#{tmpfile}')"
file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
plot_id = $gridfs.insert_one(file)
update(:correlation_plot_id => plot_id)
- end
+ #end
$gridfs.find_one(_id: correlation_plot_id).data
end
end
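
Two behavioural fixes here: statistics now ends with stat, so callers get the computed hash back (Mongoid's update_attributes returns only true or false), and the correlation plot is drawn on the raw value scale instead of -log transformed, plotting each prediction against the median of its measured replicates. A short usage sketch of the first point:

    stat = crossvalidation.statistics
    stat[:rmse]   # hypothetical key; the actual keys are whatever
                  # ValidationStatistics.regression computes
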
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 9738c1f..8c7fe68 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -7,6 +7,7 @@ module OpenTox
field :substance_ids, type: Array, default: []
field :feature_ids, type: Array, default: []
+ field :data_entries, type: Hash, default: {}
# Readers
@@ -30,6 +31,16 @@ module OpenTox
@features
end
+ def values substance,feature
+ substance = substance.id if substance.is_a? Substance
+ feature = feature.id if feature.is_a? Feature
+ if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
+ data_entries[substance.to_s][feature.to_s]
+ else
+ nil
+ end
+ end
+
# Writers
# Set compounds
@@ -42,6 +53,14 @@ module OpenTox
self.feature_ids = features.collect{|f| f.id}
end
+ def add(substance,feature,value)
+ substance = substance.id if substance.is_a? Substance
+ feature = feature.id if feature.is_a? Feature
+ data_entries[substance.to_s] ||= {}
+ data_entries[substance.to_s][feature.to_s] ||= []
+ data_entries[substance.to_s][feature.to_s] << value
+ end
+
# Dataset operations
# Split a dataset into n folds
@@ -64,11 +83,10 @@ module OpenTox
dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id )
dataset.substances.each do |substance|
substance.dataset_ids << dataset.id
- substance.toxicities.each do |feature_id,data|
- data[dataset.id.to_s] = data[self.id.to_s] # copy data entries
- end
substance.save
+ dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
end
+ dataset.save
dataset
end
start = last+1
@@ -95,7 +113,7 @@ module OpenTox
else
name = substance.name
end
- nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s][self.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq
+ nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq
if nr_measurements.size > 1
warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
@@ -103,8 +121,8 @@ module OpenTox
(0..nr_measurements.first-1).each do |i|
row = [name]
features.each do |f|
- if substance.toxicities[f.id.to_s] and substance.toxicities[f.id.to_s][self.id.to_s]
- row << substance.toxicities[f.id.to_s][self.id.to_s][i]
+ if data_entries[substance.id.to_s] and data_entries[substance.id.to_s][f.id.to_s]
+ row << data_entries[substance.id.to_s][f.id.to_s]
else
row << ""
end
@@ -146,8 +164,6 @@ module OpenTox
# does a lot of guesswork in order to determine feature types
def parse_table table
- time = Time.now
-
# features
feature_names = table.shift.collect{|f| f.strip}
warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
@@ -174,39 +190,31 @@ module OpenTox
feature_ids << feature.id if feature
end
- $logger.debug "Feature values: #{Time.now-time}"
- time = Time.now
-
- r = -1
- compound_time = 0
- value_time = 0
-
- # compounds and values
+ # substances and values
table.each_with_index do |vals,i|
- ct = Time.now
identifier = vals.shift.strip
warn "No feature values for compound at position #{i+2}." if vals.compact.empty?
begin
case compound_format
when /SMILES/i
- compound = OpenTox::Compound.from_smiles(identifier)
+ substance = OpenTox::Compound.from_smiles(identifier)
when /InChI/i
- compound = OpenTox::Compound.from_inchi(identifier)
+ substance = OpenTox::Compound.from_inchi(identifier)
# TODO nanoparticle
end
rescue
- compound = nil
+ substance = nil
end
- if compound.nil? # compound parsers may return nil
+ if substance.nil? # compound parsers may return nil
warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
next
end
- substance_ids << compound.id
- compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
- compound_time += Time.now-ct
+ substance_ids << substance.id
+ data_entries[substance.id.to_s] = {}
+ substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id
+ substance.save
- r += 1
unless vals.size == feature_ids.size
warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
next
@@ -214,32 +222,25 @@ module OpenTox
vals.each_with_index do |v,j|
if v.blank?
- warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
+ warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'."
next
elsif numeric[j]
v = v.to_f
else
v = v.strip
end
- compound.toxicities[feature_ids[j].to_s] ||= {}
- compound.toxicities[feature_ids[j].to_s][self.id.to_s] ||= []
- compound.toxicities[feature_ids[j].to_s][self.id.to_s] << v
- compound.save
+ data_entries[substance.id.to_s][feature_ids[j].to_s] ||= []
+ data_entries[substance.id.to_s][feature_ids[j].to_s] << v
end
end
- compounds.duplicates.each do |compound|
+ substances.duplicates.each do |substance|
positions = []
- compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
- warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
+ warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
end
substance_ids.uniq!
feature_ids.uniq!
-
- $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
- time = Time.now
save
- $logger.debug "Saving: #{Time.now-time}"
-
end
end
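
This is the central change of the commit: measured values move out of Substance#toxicities (a hash keyed by feature id, then dataset id) into a data_entries hash owned by the Dataset, with values and add as accessors. A minimal sketch of the new storage, assuming persisted substance and feature records:

    dataset = OpenTox::Dataset.new
    dataset.add substance, feature, 1.23       # ids are stringified internally
    dataset.add substance, feature, 1.19       # repeated measurements accumulate
    dataset.values(substance, feature)         # => [1.23, 1.19]
    dataset.values(substance, other_feature)   # => nil if nothing was recorded
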
diff --git a/lib/import.rb b/lib/import.rb
index dfe5e2d..3c6966e 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -9,16 +9,18 @@ module OpenTox
#get list of bundle URIs
bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)}
- datasets = []
bundles.each do |bundle|
+ p bundle["title"]
nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
+ p nanoparticles.size
nanoparticles.each do |nanoparticle|
uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"]
$logger.debug uuid
File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)}
studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"]
+ p uuid if studies.size < 1
studies.each do |study|
- File.open(File.join(dir,"study-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)}
+ File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)}
end
end
end
@@ -37,7 +39,7 @@ module OpenTox
:source => np["compound"]["URI"],
)
np["bundles"].keys.each do |bundle_uri|
- datasets[bundle_uri].substance_ids << nanoparticle.id
+ #datasets[bundle_uri].substance_ids << nanoparticle.id
nanoparticle["dataset_ids"] << datasets[bundle_uri].id
end
bundle = datasets[np["bundles"].keys.first].id if np["bundles"].size == 1
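
Apart from the temporary p debugging output, the substantive import fix (and the commit's namesake) is the study filename: study files were keyed by the nanoparticle uuid, so a particle with several studies kept only the last one written. Illustrated:

    # before: every study of one nanoparticle hit the same path (last write wins)
    File.join(dir, "study-#{uuid}.json")
    # after: one file per study
    File.join(dir, "study-#{study["uuid"]}.json")
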
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 140bca3..55de511 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -48,6 +48,7 @@ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i
R = Rserve::Connection.new
R.eval "
suppressPackageStartupMessages({
+ library(labeling,lib=\"#{rlib}\")
library(iterators,lib=\"#{rlib}\")
library(foreach,lib=\"#{rlib}\")
library(ggplot2,lib=\"#{rlib}\")
@@ -75,6 +76,7 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross
"nanoparticle.rb",
"dataset.rb",
"algorithm.rb",
+ "similarity",
"model.rb",
"classification.rb",
"regression.rb",
diff --git a/lib/model.rb b/lib/model.rb
index 070248a..8baed41 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -30,7 +30,7 @@ module OpenTox
self.training_dataset_id ||= training_dataset.id
self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
self.neighbor_algorithm_parameters ||= {}
- self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
+ self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id
Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm
save
@@ -41,7 +41,7 @@ module OpenTox
toxicities = []
substances = []
training_dataset.substances.each do |s|
- s["toxicities"][prediction_feature_id][training_dataset_id.to_s].each do |act|
+ training_dataset.values(s,prediction_feature_id).each do |act|
toxicities << act
substances << s
end
@@ -68,24 +68,41 @@ module OpenTox
relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
end
- def predict_compound compound
- neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
- # remove neighbors without prediction_feature
- # check for database activities (neighbors may include query compound)
+ def predict_substance substance
+ neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters)
database_activities = nil
prediction = {}
- if neighbors.collect{|n| n["_id"]}.include? compound.id
+ # handle query substance
+ if neighbors.collect{|n| n["_id"]}.include? substance.id
- me = neighbors.select{|n| n["_id"] == compound.id}.first
- database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq
+ query = neighbors.select{|n| n["_id"] == substance.id}.first
+ database_activities = training_dataset.values(query["_id"],prediction_feature_id)
prediction[:database_activities] = database_activities
- prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
- neighbors.delete_if{|n| n["_id"] == compound.id}
+ prediction[:warning] = "#{database_activities.size} substances have been removed from neighbors, because they are identical with the query substance."
+ neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation)
end
if neighbors.empty?
- prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
+ prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
+ elsif neighbors.size == 1
+ value = nil
+ tox = neighbors.first["toxicities"]
+ if tox.size == 1 # single measurement
+ value = tox
+ else # multiple measurement
+ if tox.collect{|t| t.numeric?}.uniq == [true] # numeric
+ value = tox.median
+ elsif tox.uniq.size == 1 # single value
+ value = tox.first
+ else # contradictory results
+ # TODO add majority vote
+ end
+ end
+ prediction.merge!({:value => value, :confidence => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values."}) if value
else
- prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
+ # call prediction algorithm
+ klass,method = prediction_algorithm.split('.')
+ result = Object.const_get(klass).send(method,substance,neighbors)
+ prediction.merge! result
prediction[:neighbors] = neighbors
prediction[:neighbors] ||= []
end
@@ -97,27 +114,27 @@ module OpenTox
training_dataset = Dataset.find training_dataset_id
# parse data
- compounds = []
+ substances = []
if object.is_a? Substance
- compounds = [object]
+ substances = [object]
elsif object.is_a? Array
- compounds = object
+ substances = object
elsif object.is_a? Dataset
- compounds = object.compounds
+ substances = object.substances
else
bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
end
# make predictions
predictions = {}
- compounds.each do |c|
- predictions[c.id.to_s] = predict_compound c
+ substances.each do |c|
+ predictions[c.id.to_s] = predict_substance c
predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
end
# serialize result
if object.is_a? Substance
- prediction = predictions[compounds.first.id.to_s]
+ prediction = predictions[substances.first.id.to_s]
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
return prediction
elsif object.is_a? Array
@@ -160,7 +177,8 @@ module OpenTox
model.neighbor_algorithm_parameters ||= {}
{
:type => "MP2D",
- :training_dataset_id => training_dataset.id,
+ :dataset_id => training_dataset.id,
+ :prediction_feature_id => prediction_feature.id,
:min_sim => 0.1
}.each do |key,value|
model.neighbor_algorithm_parameters[key] ||= value
@@ -179,8 +197,9 @@ module OpenTox
model.neighbor_algorithm_parameters ||= {}
{
:type => "MP2D",
- :training_dataset_id => training_dataset.id,
- :min_sim => 0.1
+ :min_sim => 0.1,
+ :dataset_id => training_dataset.id,
+ :prediction_feature_id => prediction_feature.id,
}.each do |key,value|
model.neighbor_algorithm_parameters[key] ||= value
end
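
predict_substance generalizes predict_compound: the query substance is dropped from its own neighbor list (which also keeps leave-one-out validation unbiased), a single neighbor is predicted from the median of its measurements, and the prediction algorithm is dispatched from its "Class.method" string instead of Algorithm.run. A sketch of that dispatch with a hypothetical algorithm setting:

    prediction_algorithm = "OpenTox::Classification.weighted_majority_vote"  # hypothetical value
    klass, method = prediction_algorithm.split('.')
    # const_get resolves the namespaced class, send invokes the class method
    result = Object.const_get(klass).send(method, substance, neighbors)
    # => e.g. {:value => "active", :probabilities => {...}} for classification
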
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index b79981d..6527fa3 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -8,15 +8,31 @@ module OpenTox
field :bundles, type: Array, default: []
field :proteomics, type: Hash, default: {}
- def nanoparticle_neighbors params
- dataset = Dataset.find(params[:training_dataset_id])
- Dataset.find(params[:training_dataset_id]).nanoparticles.collect do |np|
- np["tanimoto"] = 1
- np unless np.toxicities.empty?
- end.compact
+ def nanoparticle_neighbors min_sim: 0.1, type:, dataset_id:, prediction_feature_id:
+ dataset = Dataset.find(dataset_id)
+ neighbors = []
+ p dataset.data_entries.size
+ p dataset.substance_ids.size
+ p dataset.substance_ids.collect{|i| i.to_s} == dataset.data_entries.keys
+ p dataset.substance_ids.collect{|i| i.to_s}
+ p dataset.data_entries.keys
+ dataset.nanoparticles.each do |np|
+ prediction_feature_id
+ p dataset.data_entries[np.id.to_s]
+ values = dataset.values(np,prediction_feature_id)
+ p values
+ if values
+ common_descriptors = physchem_descriptors.keys & np.physchem_descriptors.keys
+ sim = Algorithm::Similarity.cosine(common_descriptors.collect{|d| physchem_descriptors[d]}, common_descriptors.collect{|d| np.physchem_descriptors[d]})
+ neighbors << {"_id" => np.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim
+ end
+ end
+ neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]}
+ neighbors
end
def add_feature feature, value, dataset_id
+ dataset = Dataset.find(dataset_id)
case feature.category
when "P-CHEM"
physchem_descriptors[feature.id.to_s] ||= []
@@ -27,55 +43,59 @@ module OpenTox
proteomics[feature.id.to_s] << value
proteomics[feature.id.to_s].uniq!
when "TOX"
- toxicities[feature.id.to_s] ||= {}
- toxicities[feature.id.to_s][dataset_id.to_s] ||= []
# TODO generic way of parsing TOX values
+ p dataset.name
+ p self.name
+ p feature.name
+ p feature.unit
+ p value
if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)"
- toxicities[feature.id.to_s][dataset_id.to_s] << -Math.log10(value)
+ dataset.add self, feature, -Math.log10(value)
else
- toxicities[feature.id.to_s][dataset_id.to_s] << value
+ dataset.add self, feature, value
end
- toxicities[feature.id.to_s][dataset_id.to_s].uniq!
+ dataset.save
else
warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
end
end
def parse_ambit_value feature, v, dataset_id
+ dataset = Dataset.find(dataset_id)
v.delete "unit"
# TODO: ppm instead of weights
if v.keys == ["textValue"]
- add_feature feature, v["textValue"], dataset_id
+ add_feature feature, v["textValue"], dataset
elsif v.keys == ["loValue"]
- add_feature feature, v["loValue"], dataset_id
+ add_feature feature, v["loValue"], dataset
elsif v.keys.size == 2 and v["errorValue"]
- add_feature feature, v["loValue"], dataset_id
- warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
+ add_feature feature, v["loValue"], dataset
+ #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
elsif v.keys.size == 2 and v["loQualifier"] == "mean"
- add_feature feature, v["loValue"], dataset_id
- warn "'#{feature.name}' is a mean value. Original data is not available."
+ add_feature feature, v["loValue"], dataset
+ #warn "'#{feature.name}' is a mean value. Original data is not available."
elsif v.keys.size == 2 and v["loQualifier"] #== ">="
- warn "Only min value available for '#{feature.name}', entry ignored"
+ #warn "Only min value available for '#{feature.name}', entry ignored"
elsif v.keys.size == 2 and v["upQualifier"] #== ">="
- warn "Only max value available for '#{feature.name}', entry ignored"
+ #warn "Only max value available for '#{feature.name}', entry ignored"
elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
- add_feature feature, v["loValue"], dataset_id
- warn "loQualifier and upQualifier are empty."
+ add_feature feature, v["loValue"], dataset
+ #warn "loQualifier and upQualifier are empty."
elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == ""
- add_feature feature, v["loValue"], dataset_id
- warn "loQualifier and upQualifier are empty."
+ add_feature feature, v["loValue"], dataset
+ #warn "loQualifier and upQualifier are empty."
elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
- add_feature feature, v["loValue"], dataset_id
- warn "loQualifier and upQualifier are empty."
+ add_feature feature, v["loValue"], dataset
+ #warn "loQualifier and upQualifier are empty."
elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"]
- add_feature feature, [v["loValue"],v["upValue"]].mean, dataset_id
- warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
+ add_feature feature, [v["loValue"],v["upValue"]].mean, dataset
+ #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"]
- warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
- add_feature feature, v["loValue"], dataset_id
+ #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
+ add_feature feature, v["loValue"], dataset
elsif v == {} # do nothing
else
- warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
+ #warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
end
end
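
nanoparticle_neighbors replaces the former constant similarity of 1 with a cosine similarity over the physchem descriptors both particles share (the p calls are leftover debugging), and add_feature now writes TOX values through Dataset#add instead of the removed Substance#toxicities. Algorithm::Similarity.cosine comes from the similarity file newly loaded in lazar.rb; a self-contained sketch of what such a cosine computes, as an assumption about its body:

    # assumed: cosine of two equal-length numeric vectors
    def cosine(a, b)
      dot = a.zip(b).map { |x, y| x * y }.sum
      dot / Math.sqrt(a.map { |x| x * x }.sum * b.map { |x| x * x }.sum)
    end

    cosine([1.0, 2.0], [2.0, 4.0])  # => 1.0 (same direction)
    cosine([1.0, 0.0], [0.0, 1.0])  # => 0.0 (orthogonal)
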
diff --git a/lib/regression.rb b/lib/regression.rb
index 2eaae73..9d305a6 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,49 +3,43 @@ module OpenTox
class Regression
- def self.local_weighted_average compound, params
+ def self.local_weighted_average substance, neighbors
weighted_sum = 0.0
sim_sum = 0.0
- neighbors = params[:neighbors]
- neighbors.each do |row|
- sim = row["tanimoto"]
- sim ||= 1 # TODO: sim f nanoparticles
- if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]
- row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
- weighted_sum += sim*act
- sim_sum += sim
- end
- end
+ neighbors.each do |neighbor|
+ sim = neighbor["similarity"]
+ activities = neighbor["toxicities"]
+ activities.each do |act|
+ weighted_sum += sim*act
+ sim_sum += sim
+ end if activities
end
sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
{:value => prediction}
end
- def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
- neighbors = params[:neighbors]
- return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
- activities = []
+ def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05"
+ values = []
fingerprints = {}
weights = []
- fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
-
- neighbors.each_with_index do |row,i|
- neighbor = Compound.find row["_id"]
- fingerprint = neighbor.fingerprint
- if row["toxicities"][params[:prediction_feature_id].to_s]
- row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
- activities << act
- weights << row["tanimoto"]
- fingerprint_ids.each_with_index do |id,j|
- fingerprints[id] ||= []
- fingerprints[id] << fingerprint.include?(id)
- end
+ fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort
+
+ neighbors.each do |n|
+ fingerprint = Substance.find(n["_id"]).fingerprint
+ activities = n["toxicities"]
+ activities.each do |act|
+ values << act
+ weights << n["similarity"]
+ fingerprint_ids.each do |id|
+ fingerprints[id] ||= []
+ fingerprints[id] << fingerprint.include?(id)
end
- end
+ end if activities
end
variables = []
- data_frame = [activities]
+ data_frame = [values]
+
fingerprints.each do |k,v|
unless v.uniq.size == 1
data_frame << v.collect{|m| m ? "T" : "F"}
@@ -54,17 +48,16 @@ module OpenTox
end
if variables.empty?
- result = local_weighted_average(compound, params)
- result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
- return result
-
+ prediction = local_weighted_average substance, neighbors
+ prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+ prediction
else
- compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
- prediction = r_model_prediction method, data_frame, variables, weights, compound_features
+ substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"}
+ prediction = r_model_prediction method, data_frame, variables, weights, substance_features
if prediction.nil? or prediction[:value].nil?
- prediction = local_weighted_average(compound, params)
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
- return prediction
+ prediction = local_weighted_average substance, neighbors
+ prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
+ prediction
else
prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
prediction[:value] = prediction[:value]
@@ -75,13 +68,10 @@ module OpenTox
end
- def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4"
-
- neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities
-
- return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
- return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].median, :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
+ #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4"
+ def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4"
+ #dataset = Dataset.find dataset_id
activities = []
weights = []
pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
@@ -90,9 +80,11 @@ module OpenTox
neighbors.each_with_index do |n,i|
neighbor = Substance.find(n["_id"])
- n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
+ activities = neighbor["toxicities"]
+ activities.each do |act|
data_frame[0][i] = act
- n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
+ # TODO: update with cosine similarity for physchem
+ weights << n["similarity"]
neighbor.physchem_descriptors.each do |pid,values|
values = [values] unless values.is_a? Array
values.uniq!
@@ -101,7 +93,7 @@ module OpenTox
data_frame[j] ||= []
data_frame[j][i] = values.for_R
end
- end
+ end if activities
(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
data_frame[j] ||= []
data_frame[j][i] ||= "NA"
@@ -117,12 +109,12 @@ module OpenTox
end
if pc_ids.empty?
- result = local_weighted_average(compound, params)
- result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
- return result
+ prediction = local_weighted_average substance, neighbors
+ prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+ prediction
else
query_descriptors = pc_ids.collect do |i|
- compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
+ substance.physchem_descriptors[i] ? substance.physchem_descriptors[i].for_R : "NA"
end
remove_idx = []
query_descriptors.each_with_index do |v,i|
@@ -135,9 +127,9 @@ module OpenTox
end
prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
if prediction.nil?
- prediction = local_weighted_average(compound, params)
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
- return prediction
+ prediction = local_weighted_average substance, neighbors
+ prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
+ prediction
else
prediction
end
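
All three regression methods now take (substance, neighbors) positionally and read measurements from each neighbor's "toxicities" entry; both PLS variants fall back to local_weighted_average when no usable variables remain or the R model fails, and the PLS prediction interval stays value ± 1.96·rmse. The fallback itself is a similarity-weighted mean over all neighbor measurements; a worked example:

    neighbors = [
      {"similarity" => 0.9, "toxicities" => [2.0]},
      {"similarity" => 0.5, "toxicities" => [4.0, 6.0]},
    ]
    weighted_sum = 0.0
    sim_sum = 0.0
    neighbors.each do |n|
      activities = n["toxicities"]
      activities.each do |act|
        weighted_sum += n["similarity"] * act
        sim_sum += n["similarity"]
      end if activities
    end
    weighted_sum / sim_sum
    # (0.9*2.0 + 0.5*4.0 + 0.5*6.0) / (0.9 + 0.5 + 0.5) = 6.8 / 1.9 ≈ 3.58
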
diff --git a/lib/substance.rb b/lib/substance.rb
index 82ca65d..6768ce7 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -2,7 +2,6 @@ module OpenTox
class Substance
field :physchem_descriptors, type: Hash, default: {}
- field :toxicities, type: Hash, default: {}
field :dataset_ids, type: Array, default: []
end
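
With the toxicities field gone from Substance, the lookup pattern changes from a doubly keyed hash on the substance to a query against the dataset, matching the new Dataset#values accessor above:

    # before this commit
    substance.toxicities[feature.id.to_s][dataset.id.to_s]  # => [v1, v2, ...]
    # after
    dataset.values(substance, feature)                      # => [v1, v2, ...]
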
diff --git a/lib/validation.rb b/lib/validation.rb
index 334efd7..015e718 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -32,9 +32,12 @@ module OpenTox
predictions = validation_model.predict test_set.substances
predictions.each{|cid,p| p.delete(:neighbors)}
nr_unpredicted = 0
+ p predictions.size
predictions.each do |cid,prediction|
+ p prediction
if prediction[:value]
tox = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s]
+ p tox
#prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][test_set.id.to_s]
prediction[:measured] = tox[test_set.id.to_s] if tox
else
@@ -42,6 +45,7 @@ module OpenTox
end
predictions.delete(cid) unless prediction[:value] and prediction[:measured]
end
+ p predictions.size
validation = self.new(
:model_id => validation_model.id,
:test_dataset_id => test_set.id,