From b8bb12c8a163c238d7d4387c1914e2100bb660df Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 12 May 2016 15:23:01 +0200 Subject: enm study import fixed --- lib/classification.rb | 15 +++---- lib/compound.rb | 120 +++++++++++++++++++++++++------------------------ lib/crossvalidation.rb | 21 ++++++--- lib/dataset.rb | 77 +++++++++++++++---------------- lib/import.rb | 8 ++-- lib/lazar.rb | 2 + lib/model.rb | 65 +++++++++++++++++---------- lib/nanoparticle.rb | 80 ++++++++++++++++++++------------- lib/regression.rb | 102 +++++++++++++++++++---------------------- lib/substance.rb | 1 - lib/validation.rb | 4 ++ 11 files changed, 270 insertions(+), 225 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 4cc9201..48ff8b3 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,17 +3,15 @@ module OpenTox class Classification - def self.weighted_majority_vote compound, params - neighbors = params[:neighbors] - feature_id = params[:prediction_feature_id].to_s - dataset_id = params[:training_dataset_id].to_s + def self.weighted_majority_vote substance, neighbors sims = {} - neighbors.each do |n| - sim = n["tanimoto"] - n["toxicities"][feature_id][dataset_id].each do |act| + neighbors.each do |neighbor| + sim = neighbor["similarity"] + activities = neighbor["toxicities"] + activities.each do |act| sims[act] ||= [] sims[act] << sim - end if n["toxicities"][feature_id][dataset_id] + end if activities end sim_all = sims.collect{|a,s| s}.flatten sim_sum = sim_all.sum @@ -26,7 +24,6 @@ module OpenTox p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) {:value => prediction,:probabilities => probabilities} - end end end diff --git a/lib/compound.rb b/lib/compound.rb index 0a9111b..2554d54 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -254,67 +254,69 @@ module OpenTox self["chemblid"] end - def fingerprint_count_neighbors params - # TODO fix +# def fingerprint_count_neighbors params +# # TODO fix +# neighbors = [] +# query_fingerprint = self.fingerprint params[:type] +# training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| +# unless self == compound +# candidate_fingerprint = compound.fingerprint params[:type] +# features = (query_fingerprint + candidate_fingerprint).uniq +# min_sum = 0 +# max_sum = 0 +# features.each do |f| +# min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax +# min_sum += min +# max_sum += max +# end +# max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f +# neighbors << [compound.id, sim] if sim and sim >= params[:min_sim] +# end +# end +# neighbors.sort{|a,b| b.last <=> a.last} +# end + + def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) neighbors = [] - query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| - unless self == compound - candidate_fingerprint = compound.fingerprint params[:type] - features = (query_fingerprint + candidate_fingerprint).uniq - min_sum = 0 - max_sum = 0 - features.each do |f| - min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax - min_sum += min - max_sum += max - end - max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f - neighbors << [compound.id, sim] if sim and sim >= params[:min_sim] + dataset = Dataset.find(dataset_id) + if type == DEFAULT_FINGERPRINT + neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id) + neighbors.each do |n| + n["toxicities"] = dataset.values(n["_id"],prediction_feature_id) end - end - neighbors.sort{|a,b| b.last <=> a.last} - end - - def fingerprint_neighbors params - bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim] - neighbors = [] - if params[:type] == DEFAULT_FINGERPRINT - neighbors = db_neighbors params else - query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]) - prediction_feature = training_dataset.features.first - training_dataset.compounds.each do |compound| - candidate_fingerprint = compound.fingerprint params[:type] - sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - fid = prediction_feature.id.to_s - did = params[:training_dataset_id].to_s - v = compound.toxicities[prediction_feature.id.to_s] - neighbors << {"_id" => compound.id, "toxicities" => {fid => {did => v[params[:training_dataset_id].to_s]}}, "tanimoto" => sim} if sim >= params[:min_sim] and v - end - neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} - end - neighbors - end - - def physchem_neighbors params - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] - neighbors = [] - feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| - # TODO implement pearson and cosine similarity separatly - R.assign "x", query_fingerprint - R.assign "y", candidate_fingerprint - sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first - if sim >= params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + query_fingerprint = self.fingerprint type + dataset.compounds.each do |compound| + values = dataset.values(compound,prediction_feature_id) + if values + candidate_fingerprint = compound.fingerprint type + sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) + neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + end end + neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} end neighbors end - def db_neighbors params +# def physchem_neighbors params +# # TODO: fix, tests +# feature_dataset = Dataset.find params[:feature_dataset_id] +# query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] +# neighbors = [] +# feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| +# # TODO implement pearson and cosine similarity separatly +# R.assign "x", query_fingerprint +# R.assign "y", candidate_fingerprint +# sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first +# if sim >= params[:min_sim] +# neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming +# end +# end +# neighbors +# end + + def db_neighbors min_sim: 0.1, dataset_id: # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb #qn = default_fingerprint_size @@ -326,20 +328,20 @@ module OpenTox #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self {'$project' => { - 'tanimoto' => {'$let' => { + 'similarity' => {'$let' => { 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, - #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}}, 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, '_id' => 1, - 'toxicities' => 1, + #'toxicities' => 1, 'dataset_ids' => 1 }}, - {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, - {'$sort' => {'tanimoto' => -1}} + {'$match' => {'similarity' => {'$gte' => min_sim}}}, + {'$sort' => {'similarity' => -1}} ] - $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} + # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array + $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id} end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 8e0c5b9..da4b731 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -77,6 +77,7 @@ module OpenTox def statistics stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) update_attributes(stat) + stat end def confidence_plot @@ -120,6 +121,7 @@ module OpenTox def statistics stat = ValidationStatistics.regression predictions update_attributes(stat) + stat end def misclassifications n=nil @@ -164,24 +166,29 @@ module OpenTox end def correlation_plot - unless correlation_plot_id + #unless correlation_plot_id tmpfile = "/tmp/#{id.to_s}_correlation.png" - x = predictions.collect{|p| p[1]} - y = predictions.collect{|p| p[2]} + x = [] + y = [] + predictions.each do |sid,p| + x << p["value"] + y << p["measured"].median + end attributes = Model::Lazar.find(self.model_id).attributes attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") R.assign "measurement", x R.assign "prediction", y - R.eval "all = c(-log(measurement),-log(prediction))" + R.eval "all = c(measurement,prediction)" R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" + R.eval "image = qplot(prediction,measurement,main='#{self.name}',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" + #R.eval "ggsave(file='#{tmpfile}', plot=image)" + R.eval "ggsave(file='#{tmpfile}')" file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - end + #end $gridfs.find_one(_id: correlation_plot_id).data end end diff --git a/lib/dataset.rb b/lib/dataset.rb index 9738c1f..8c7fe68 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -7,6 +7,7 @@ module OpenTox field :substance_ids, type: Array, default: [] field :feature_ids, type: Array, default: [] + field :data_entries, type: Hash, default: {} # Readers @@ -30,6 +31,16 @@ module OpenTox @features end + def values substance,feature + substance = substance.id if substance.is_a? Substance + feature = feature.id if feature.is_a? Feature + if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s] + data_entries[substance.to_s][feature.to_s] + else + nil + end + end + # Writers # Set compounds @@ -42,6 +53,14 @@ module OpenTox self.feature_ids = features.collect{|f| f.id} end + def add(substance,feature,value) + substance = substance.id if substance.is_a? Substance + feature = feature.id if feature.is_a? Feature + data_entries[substance.to_s] ||= {} + data_entries[substance.to_s][feature.to_s] ||= [] + data_entries[substance.to_s][feature.to_s] << value + end + # Dataset operations # Split a dataset into n folds @@ -64,11 +83,10 @@ module OpenTox dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) dataset.substances.each do |substance| substance.dataset_ids << dataset.id - substance.toxicities.each do |feature_id,data| - data[dataset.id.to_s] = data[self.id.to_s] # copy data entries - end substance.save + dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} end + dataset.save dataset end start = last+1 @@ -95,7 +113,7 @@ module OpenTox else name = substance.name end - nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s][self.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq if nr_measurements.size > 1 warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." @@ -103,8 +121,8 @@ module OpenTox (0..nr_measurements.first-1).each do |i| row = [name] features.each do |f| - if substance.toxicities[f.id.to_s] and substance.toxicities[f.id.to_s][self.id.to_s] - row << substance.toxicities[f.id.to_s][self.id.to_s][i] + if data_entries[substance.id.to_s] and data_entries[substance.id.to_s][f.id.to_s] + row << data_entries[substance.id.to_s][f.id.to_s] else row << "" end @@ -146,8 +164,6 @@ module OpenTox # does a lot of guesswork in order to determine feature types def parse_table table - time = Time.now - # features feature_names = table.shift.collect{|f| f.strip} warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size @@ -174,39 +190,31 @@ module OpenTox feature_ids << feature.id if feature end - $logger.debug "Feature values: #{Time.now-time}" - time = Time.now - - r = -1 - compound_time = 0 - value_time = 0 - - # compounds and values + # substances and values table.each_with_index do |vals,i| - ct = Time.now identifier = vals.shift.strip warn "No feature values for compound at position #{i+2}." if vals.compact.empty? begin case compound_format when /SMILES/i - compound = OpenTox::Compound.from_smiles(identifier) + substance = OpenTox::Compound.from_smiles(identifier) when /InChI/i - compound = OpenTox::Compound.from_inchi(identifier) + substance = OpenTox::Compound.from_inchi(identifier) # TODO nanoparticle end rescue - compound = nil + substance = nil end - if compound.nil? # compound parsers may return nil + if substance.nil? # compound parsers may return nil warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end - substance_ids << compound.id - compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id - compound_time += Time.now-ct + substance_ids << substance.id + data_entries[substance.id.to_s] = {} + substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id + substance.save - r += 1 unless vals.size == feature_ids.size warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next @@ -214,32 +222,25 @@ module OpenTox vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." + warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." next elsif numeric[j] v = v.to_f else v = v.strip end - compound.toxicities[feature_ids[j].to_s] ||= {} - compound.toxicities[feature_ids[j].to_s][self.id.to_s] ||= [] - compound.toxicities[feature_ids[j].to_s][self.id.to_s] << v - compound.save + data_entries[substance.id.to_s][feature_ids[j].to_s] ||= [] + data_entries[substance.id.to_s][feature_ids[j].to_s] << v end end - compounds.duplicates.each do |compound| + substances.duplicates.each do |substance| positions = [] - compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} - warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} + warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end substance_ids.uniq! feature_ids.uniq! - - $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" - time = Time.now save - $logger.debug "Saving: #{Time.now-time}" - end end diff --git a/lib/import.rb b/lib/import.rb index dfe5e2d..3c6966e 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -9,16 +9,18 @@ module OpenTox #get list of bundle URIs bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} - datasets = [] bundles.each do |bundle| + p bundle["title"] nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] + p nanoparticles.size nanoparticles.each do |nanoparticle| uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"] $logger.debug uuid File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)} studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"] + p uuid if studies.size < 1 studies.each do |study| - File.open(File.join(dir,"study-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} + File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} end end end @@ -37,7 +39,7 @@ module OpenTox :source => np["compound"]["URI"], ) np["bundles"].keys.each do |bundle_uri| - datasets[bundle_uri].substance_ids << nanoparticle.id + #datasets[bundle_uri].substance_ids << nanoparticle.id nanoparticle["dataset_ids"] << datasets[bundle_uri].id end bundle = datasets[np["bundles"].keys.first].id if np["bundles"].size == 1 diff --git a/lib/lazar.rb b/lib/lazar.rb index 140bca3..55de511 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -48,6 +48,7 @@ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i R = Rserve::Connection.new R.eval " suppressPackageStartupMessages({ + library(labeling,lib=\"#{rlib}\") library(iterators,lib=\"#{rlib}\") library(foreach,lib=\"#{rlib}\") library(ggplot2,lib=\"#{rlib}\") @@ -75,6 +76,7 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross "nanoparticle.rb", "dataset.rb", "algorithm.rb", + "similarity", "model.rb", "classification.rb", "regression.rb", diff --git a/lib/model.rb b/lib/model.rb index 070248a..8baed41 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -30,7 +30,7 @@ module OpenTox self.training_dataset_id ||= training_dataset.id self.name ||= "#{training_dataset.name} #{prediction_feature.name}" self.neighbor_algorithm_parameters ||= {} - self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm save @@ -41,7 +41,7 @@ module OpenTox toxicities = [] substances = [] training_dataset.substances.each do |s| - s["toxicities"][prediction_feature_id][training_dataset_id.to_s].each do |act| + training_dataset.values(s,prediction_feature_id).each do |act| toxicities << act substances << s end @@ -68,24 +68,41 @@ module OpenTox relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h end - def predict_compound compound - neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) - # remove neighbors without prediction_feature - # check for database activities (neighbors may include query compound) + def predict_substance substance + neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) database_activities = nil prediction = {} - if neighbors.collect{|n| n["_id"]}.include? compound.id + # handle query substance + if neighbors.collect{|n| n["_id"]}.include? substance.id - me = neighbors.select{|n| n["_id"] == compound.id}.first - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq + query = neighbors.select{|n| n["_id"] == substance.id}.first + database_activities = training_dataset.values(query["_id"],prediction_feature_id) prediction[:database_activities] = database_activities - prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." - neighbors.delete_if{|n| n["_id"] == compound.id} + prediction[:warning] = "#{database_activities.size} substances have been removed from neighbors, because they are identical with the query substance." + neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) end if neighbors.empty? - prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) + prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + elsif neighbors.size == 1 + value = nil + tox = neighbors.first["toxicities"] + if tox.size == 1 # single measurement + value = tox + else # multiple measurement + if tox.collect{|t| t.numeric?}.uniq == [true] # numeric + value = tox.median + elsif tox.uniq.size == 1 # single value + value = tox.first + else # contradictory results + # TODO add majority vote + end + end + prediction.merge!({:value => value, :confidence => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values."}) if value else - prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id})) + # call prediction algorithm + klass,method = prediction_algorithm.split('.') + result = Object.const_get(klass).send(method,substance,neighbors) + prediction.merge! result prediction[:neighbors] = neighbors prediction[:neighbors] ||= [] end @@ -97,27 +114,27 @@ module OpenTox training_dataset = Dataset.find training_dataset_id # parse data - compounds = [] + substances = [] if object.is_a? Substance - compounds = [object] + substances = [object] elsif object.is_a? Array - compounds = object + substances = object elsif object.is_a? Dataset - compounds = object.compounds + substances = object.substances else bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." end # make predictions predictions = {} - compounds.each do |c| - predictions[c.id.to_s] = predict_compound c + substances.each do |c| + predictions[c.id.to_s] = predict_substance c predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id end # serialize result if object.is_a? Substance - prediction = predictions[compounds.first.id.to_s] + prediction = predictions[substances.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity return prediction elsif object.is_a? Array @@ -160,7 +177,8 @@ module OpenTox model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", - :training_dataset_id => training_dataset.id, + :dataset_id => training_dataset.id, + :prediction_feature_id => prediction_feature.id, :min_sim => 0.1 }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value @@ -179,8 +197,9 @@ module OpenTox model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", - :training_dataset_id => training_dataset.id, - :min_sim => 0.1 + :min_sim => 0.1, + :dataset_id => training_dataset.id, + :prediction_feature_id => prediction_feature.id, }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index b79981d..6527fa3 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -8,15 +8,31 @@ module OpenTox field :bundles, type: Array, default: [] field :proteomics, type: Hash, default: {} - def nanoparticle_neighbors params - dataset = Dataset.find(params[:training_dataset_id]) - Dataset.find(params[:training_dataset_id]).nanoparticles.collect do |np| - np["tanimoto"] = 1 - np unless np.toxicities.empty? - end.compact + def nanoparticle_neighbors min_sim: 0.1, type:, dataset_id:, prediction_feature_id: + dataset = Dataset.find(dataset_id) + neighbors = [] + p dataset.data_entries.size + p dataset.substance_ids.size + p dataset.substance_ids.collect{|i| i.to_s} == dataset.data_entries.keys + p dataset.substance_ids.collect{|i| i.to_s} + p dataset.data_entries.keys + dataset.nanoparticles.each do |np| + prediction_feature_id + p dataset.data_entries[np.id.to_s] + values = dataset.values(np,prediction_feature_id) + p values + if values + common_descriptors = physchem_descriptors.keys & np.physchem_descriptors.keys + sim = Algorithm::Similarity.cosine(common_descriptors.collect{|d| physchem_descriptors[d]}, common_descriptors.collect{|d| np.physchem_descriptors[d]}) + neighbors << {"_id" => np.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + end + end + neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + neighbors end def add_feature feature, value, dataset_id + dataset = Dataset.find(dataset_id) case feature.category when "P-CHEM" physchem_descriptors[feature.id.to_s] ||= [] @@ -27,55 +43,59 @@ module OpenTox proteomics[feature.id.to_s] << value proteomics[feature.id.to_s].uniq! when "TOX" - toxicities[feature.id.to_s] ||= {} - toxicities[feature.id.to_s][dataset_id.to_s] ||= [] # TODO generic way of parsing TOX values + p dataset.name + p self.name + p feature.name + p feature.unit + p value if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" - toxicities[feature.id.to_s][dataset_id.to_s] << -Math.log10(value) + dataset.add self, feature, -Math.log10(value) else - toxicities[feature.id.to_s][dataset_id.to_s] << value + dataset.add self, feature, value end - toxicities[feature.id.to_s][dataset_id.to_s].uniq! + dataset.save else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end end def parse_ambit_value feature, v, dataset_id + dataset = Dataset.find(dataset_id) v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] - add_feature feature, v["textValue"], dataset_id + add_feature feature, v["textValue"], dataset elsif v.keys == ["loValue"] - add_feature feature, v["loValue"], dataset_id + add_feature feature, v["loValue"], dataset elsif v.keys.size == 2 and v["errorValue"] - add_feature feature, v["loValue"], dataset_id - warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + add_feature feature, v["loValue"], dataset + #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." elsif v.keys.size == 2 and v["loQualifier"] == "mean" - add_feature feature, v["loValue"], dataset_id - warn "'#{feature.name}' is a mean value. Original data is not available." + add_feature feature, v["loValue"], dataset + #warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" - warn "Only min value available for '#{feature.name}', entry ignored" + #warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" - warn "Only max value available for '#{feature.name}', entry ignored" + #warn "Only max value available for '#{feature.name}', entry ignored" elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? - add_feature feature, v["loValue"], dataset_id - warn "loQualifier and upQualifier are empty." + add_feature feature, v["loValue"], dataset + #warn "loQualifier and upQualifier are empty." elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" - add_feature feature, v["loValue"], dataset_id - warn "loQualifier and upQualifier are empty." + add_feature feature, v["loValue"], dataset + #warn "loQualifier and upQualifier are empty." elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? - add_feature feature, v["loValue"], dataset_id - warn "loQualifier and upQualifier are empty." + add_feature feature, v["loValue"], dataset + #warn "loQualifier and upQualifier are empty." elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] - add_feature feature, [v["loValue"],v["upValue"]].mean, dataset_id - warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." + add_feature feature, [v["loValue"],v["upValue"]].mean, dataset + #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"] - warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." - add_feature feature, v["loValue"], dataset_id + #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + add_feature feature, v["loValue"], dataset elsif v == {} # do nothing else - warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." + #warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." end end diff --git a/lib/regression.rb b/lib/regression.rb index 2eaae73..9d305a6 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,49 +3,43 @@ module OpenTox class Regression - def self.local_weighted_average compound, params + def self.local_weighted_average substance, neighbors weighted_sum = 0.0 sim_sum = 0.0 - neighbors = params[:neighbors] - neighbors.each do |row| - sim = row["tanimoto"] - sim ||= 1 # TODO: sim f nanoparticles - if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| - weighted_sum += sim*act - sim_sum += sim - end - end + neighbors.each do |neighbor| + sim = neighbor["similarity"] + activities = neighbor["toxicities"] + activities.each do |act| + weighted_sum += sim*act + sim_sum += sim + end if activities end sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end - def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" - neighbors = params[:neighbors] - return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - activities = [] + def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05" + values = [] fingerprints = {} weights = [] - fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort - - neighbors.each_with_index do |row,i| - neighbor = Compound.find row["_id"] - fingerprint = neighbor.fingerprint - if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| - activities << act - weights << row["tanimoto"] - fingerprint_ids.each_with_index do |id,j| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) - end + fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each do |n| + fingerprint = Substance.find(n["_id"]).fingerprint + activities = n["toxicities"] + activities.each do |act| + values << act + weights << n["similarity"] + fingerprint_ids.each do |id| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) end - end + end if activities end variables = [] - data_frame = [activities] + data_frame = [values] + fingerprints.each do |k,v| unless v.uniq.size == 1 data_frame << v.collect{|m| m ? "T" : "F"} @@ -54,17 +48,16 @@ module OpenTox end if variables.empty? - result = local_weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result - + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction else - compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} - prediction = r_model_prediction method, data_frame, variables, weights, compound_features + substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} + prediction = r_model_prediction method, data_frame, variables, weights, substance_features if prediction.nil? or prediction[:value].nil? - prediction = local_weighted_average(compound, params) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return prediction + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." + prediction else prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] prediction[:value] = prediction[:value] @@ -75,13 +68,10 @@ module OpenTox end - def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" - - neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities - - return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].median, :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4" + def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4" + #dataset = Dataset.find dataset_id activities = [] weights = [] pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq @@ -90,9 +80,11 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| + activities = neighbor["toxicities"] + activities.each do |act| data_frame[0][i] = act - n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + # TODO: update with cosine similarity for physchem + weights << n["similarity"] neighbor.physchem_descriptors.each do |pid,values| values = [values] unless values.is_a? Array values.uniq! @@ -101,7 +93,7 @@ module OpenTox data_frame[j] ||= [] data_frame[j][i] = values.for_R end - end + end if activities (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" @@ -117,12 +109,12 @@ module OpenTox end if pc_ids.empty? - result = local_weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction else query_descriptors = pc_ids.collect do |i| - compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + substance.physchem_descriptors[i] ? substance.physchem_descriptors[i].for_R : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| @@ -135,9 +127,9 @@ module OpenTox end prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? - prediction = local_weighted_average(compound, params) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return prediction + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." + prediction else prediction end diff --git a/lib/substance.rb b/lib/substance.rb index 82ca65d..6768ce7 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -2,7 +2,6 @@ module OpenTox class Substance field :physchem_descriptors, type: Hash, default: {} - field :toxicities, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end diff --git a/lib/validation.rb b/lib/validation.rb index 334efd7..015e718 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -32,9 +32,12 @@ module OpenTox predictions = validation_model.predict test_set.substances predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 + p predictions.size predictions.each do |cid,prediction| + p prediction if prediction[:value] tox = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + p tox #prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][test_set.id.to_s] prediction[:measured] = tox[test_set.id.to_s] if tox else @@ -42,6 +45,7 @@ module OpenTox end predictions.delete(cid) unless prediction[:value] and prediction[:measured] end + p predictions.size validation = self.new( :model_id => validation_model.id, :test_dataset_id => test_set.id, -- cgit v1.2.3