From f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 27 May 2016 19:16:16 +0200 Subject: first correlation of nanoparticle predictions --- lib/compound.rb | 30 ----------- lib/crossvalidation.rb | 2 +- lib/dataset.rb | 11 ++-- lib/import.rb | 57 +++++++++++++++------ lib/leave-one-out-validation.rb | 3 +- lib/model.rb | 16 +++--- lib/nanoparticle.rb | 110 +++++++++++++++++++++++++++++++--------- lib/regression.rb | 2 - lib/similarity.rb | 2 +- lib/validation-statistics.rb | 13 ++--- 10 files changed, 149 insertions(+), 97 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 89e9db2..a87678e 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -254,36 +254,6 @@ module OpenTox self["chemblid"] end -=begin - def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) - neighbors = [] - dataset = Dataset.find(dataset_id) - query_fingerprint = self.fingerprint type - dataset.compounds.each do |compound| - values = dataset.values(compound,prediction_feature_id) - if values - candidate_fingerprint = compound.fingerprint type -# features = (query_fingerprint + candidate_fingerprint).uniq -# min_sum = 0 -# max_sum = 0 -# features.each do |f| -# min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax -# min_sum += min -# max_sum += max -# end -# max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f -# neighbors << [compound.id, sim] if sim and sim >= params[:min_sim] -# end -# end -# neighbors.sort{|a,b| b.last <=> a.last} - sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) - neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim - end - end - neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} - end -=end - def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) neighbors = [] dataset = Dataset.find(dataset_id) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 357f0fa..420dd8c 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -168,7 +168,7 @@ module OpenTox def correlation_plot unless correlation_plot_id - plot_id = ValidationStatistics.correlation_plot predictions + plot_id = ValidationStatistics.correlation_plot id, predictions update(:correlation_plot_id => plot_id) end $gridfs.find_one(_id: correlation_plot_id).data diff --git a/lib/dataset.rb b/lib/dataset.rb index 9138452..0c65d61 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -46,11 +46,8 @@ module OpenTox feature = feature.id if feature.is_a? Feature data_entries[substance.to_s] ||= {} data_entries[substance.to_s][feature.to_s] ||= [] - if value.is_a? Array - data_entries[substance.to_s][feature.to_s] += value - else - data_entries[substance.to_s][feature.to_s] << value - end + data_entries[substance.to_s][feature.to_s] << value + #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source end # Dataset operations @@ -75,6 +72,7 @@ module OpenTox dataset = self.class.create(:source => self.id ) substances.each do |substance| substance.dataset_ids << dataset.id + substance.dataset_ids.uniq! substance.save dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} end @@ -200,7 +198,8 @@ module OpenTox next end all_substances << substance - substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id + substance.dataset_ids << self.id + substance.dataset_ids.uniq! substance.save unless vals.size == features.size diff --git a/lib/import.rb b/lib/import.rb index 2dcc361..80d4579 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -10,15 +10,15 @@ module OpenTox bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} bundles.each do |bundle| - p bundle["title"] + $logger.debug bundle["title"] nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] - p nanoparticles.size + $logger.debug nanoparticles.size nanoparticles.each do |nanoparticle| uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"] $logger.debug uuid File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)} studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"] - p uuid if studies.size < 1 + $logger.debug uuid if studies.size < 1 studies.each do |study| File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} end @@ -27,35 +27,58 @@ module OpenTox end def self.import dir="." + start_time = Time.now + t1 = 0 + t2 = 0 datasets = {} JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle| datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) end Dir[File.join(dir,"study*.json")].each do |s| + t = Time.now study = JSON.parse(File.read(s)) np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json"))) + core = {} + coating = [] + np["composition"].each do |c| + if c["relation"] == "HAS_CORE" + core = { + :uri => c["component"]["compound"]["URI"], + :name => c["component"]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + } + elsif c["relation"] == "HAS_COATING" + coating << { + :uri => c["component"]["compound"]["URI"], + :name => c["component"]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + } + end + end if np["composition"] nanoparticle = Nanoparticle.find_or_create_by( :name => np["values"]["https://data.enanomapper.net/identifier/name"], :source => np["compound"]["URI"], + :core => core, + :coating => coating ) np["bundles"].keys.each do |bundle_uri| - nanoparticle["dataset_ids"] << datasets[bundle_uri].id + nanoparticle.dataset_ids << datasets[bundle_uri].id end - bundle = datasets[np["bundles"].keys.first].id if np["bundles"].size == 1 + dataset = datasets[np["bundles"].keys.first] + proteomics_features = {} study["effects"].each do |effect| effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - # TODO parse core/coating - #$logger.debug File.join(np["compound"]["URI"],"study") effect["conditions"].delete_if { |k, v| v.nil? } - # parse proteomics data - if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data +=begin JSON.parse(effect["result"]["textValue"]).each do |identifier, value| - feature = klass.find_or_create_by( - :name => identifier, - :category => "Proteomics", - ) - nanoparticle.parse_ambit_value feature, value, bundle + # time critical step + t = Time.now + proteomics_features[identifier] ||= klass.find_or_create_by(:name => identifier, :category => "Proteomics") + t1 += Time.now - t + t = Time.now + nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset + t2 += Time.now - t end +=end else feature = klass.find_or_create_by( :name => effect["endpoint"], @@ -63,10 +86,14 @@ module OpenTox :category => study["protocol"]["topcategory"], :conditions => effect["conditions"] ) - nanoparticle.parse_ambit_value feature, effect["result"], bundle + nanoparticle.parse_ambit_value feature, effect["result"], dataset end end nanoparticle.save + #p "Total time: #{Time.now - start_time}" + #p "Proteomics features: #{t1}" + #p "Proteomics values: #{t2}" + #p "Time2: #{t2}" end datasets.each { |u,d| d.save } end diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index b8deae9..9698e05 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -86,7 +86,6 @@ module OpenTox class RegressionLeaveOneOutValidation < LeaveOneOutValidation - include Plot field :rmse, type: Float, default: 0 field :mae, type: Float, default: 0 @@ -101,7 +100,7 @@ module OpenTox def correlation_plot unless correlation_plot_id - #plot_id = correlation_plot + plot_id = ValidationStatistics.correlation_plot id, predictions update(:correlation_plot_id => plot_id) end $gridfs.find_one(_id: correlation_plot_id).data diff --git a/lib/model.rb b/lib/model.rb index 3a178a1..18d621b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -32,12 +32,13 @@ module OpenTox self.neighbor_algorithm_parameters ||= {} self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id - Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm + #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm save self end def correlation_filter + self.relevant_features = {} toxicities = [] substances = [] training_dataset.substances.each do |s| @@ -49,23 +50,22 @@ module OpenTox R.assign "tox", toxicities feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} R.assign "feature", feature_values begin - #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')" + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" pvalue = R.eval("cor$p.value").to_ruby if pvalue <= 0.05 r = R.eval("cor$estimate").to_ruby - relevant_features[feature] = {} - relevant_features[feature]["pvalue"] = pvalue - relevant_features[feature]["r"] = r + self.relevant_features[feature_id] = {} + self.relevant_features[feature_id]["pvalue"] = pvalue + self.relevant_features[feature_id]["r"] = r end rescue warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." end end - relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h end def predict_substance substance diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 7890a19..5c6d944 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -3,12 +3,11 @@ module OpenTox class Nanoparticle < Substance include OpenTox - field :core, type: String + field :core, type: Hash, default: {} field :coating, type: Array, default: [] - field :bundles, type: Array, default: [] field :proteomics, type: Hash, default: {} - def nanoparticle_neighbors min_sim: 0.1, type:, dataset_id:, prediction_feature_id: + def nanoparticle_neighbors_old min_sim: 0.9, type:, dataset_id:, prediction_feature_id: dataset = Dataset.find(dataset_id) neighbors = [] dataset.nanoparticles.each do |np| @@ -25,33 +24,96 @@ module OpenTox neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} neighbors end - - def add_feature feature, value, dataset_id + + def nanoparticle_neighbors min_sim: 0.9, type:, dataset_id:, prediction_feature_id: + p self.name + #p self.physchem_descriptors.keys.size dataset = Dataset.find(dataset_id) - case feature.category - when "P-CHEM" - physchem_descriptors[feature.id.to_s] ||= [] - physchem_descriptors[feature.id.to_s] << value - physchem_descriptors[feature.id.to_s].uniq! - when "Proteomics" - proteomics[feature.id.to_s] ||= [] - proteomics[feature.id.to_s] << value - proteomics[feature.id.to_s].uniq! - when "TOX" - # TODO generic way of parsing TOX values - if feature.name == "Net cell association" and feature.unit == "mL/ug(Mg)" - dataset.add self, feature, -Math.log10(value) + relevant_features = {} + toxicities = [] + substances = [] + # TODO: exclude query activities!!! + dataset.substances.each do |s| + dataset.values(s,prediction_feature_id).each do |act| + toxicities << act + substances << s + end + end + R.assign "tox", toxicities + feature_ids = physchem_descriptors.keys.select{|fid| Feature.find(fid).is_a? NumericFeature} + # identify relevant features + feature_ids.each do |feature_id| + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} + R.assign "feature", feature_values + begin + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature_id] = {} + relevant_features[feature_id]["pvalue"] = pvalue + relevant_features[feature_id]["r"] = r + relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby + relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." + end + end + neighbors = [] + substances.each do |substance| + values = dataset.values(substance,prediction_feature_id) + if values + common_descriptors = relevant_features.keys & substance.physchem_descriptors.keys + # scale values + query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + #weights = common_descriptors.collect{|d| 1-relevant_features[d]["pvalue"]} + weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} + #p weights + sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) + ##p "SIM" + #p [sim, Algorithm::Similarity.cosine(query_descriptors,neighbor_descriptors)] + neighbors << {"_id" => substance.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + end + end + p neighbors.size + neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + neighbors + end + + def add_feature feature, value, dataset + unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand + case feature.category + when "P-CHEM" + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! + when "Proteomics" + proteomics[feature.id.to_s] ||= [] + proteomics[feature.id.to_s] << value + proteomics[feature.id.to_s].uniq! + when "TOX" + # TODO generic way of parsing TOX values + if feature.name == "Net cell association" and feature.unit == "mL/ug(Mg)" + dataset.add self, feature, Math.log2(value) + elsif feature.name == "Total protein (BCA assay)" + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! + else + dataset.add self, feature, value + end + dataset.save + dataset_ids << dataset.id + dataset_ids.uniq! else - dataset.add self, feature, value + warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end - dataset.save - else - warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end end - def parse_ambit_value feature, v, dataset_id - dataset = Dataset.find(dataset_id) + def parse_ambit_value feature, v, dataset v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] diff --git a/lib/regression.rb b/lib/regression.rb index 9d305a6..6487557 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -71,7 +71,6 @@ module OpenTox #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4" def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4" - #dataset = Dataset.find dataset_id activities = [] weights = [] pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq @@ -83,7 +82,6 @@ module OpenTox activities = neighbor["toxicities"] activities.each do |act| data_frame[0][i] = act - # TODO: update with cosine similarity for physchem weights << n["similarity"] neighbor.physchem_descriptors.each do |pid,values| values = [values] unless values.is_a? Array diff --git a/lib/similarity.rb b/lib/similarity.rb index f25d4c3..00179c1 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -38,7 +38,7 @@ module OpenTox magnitude_a += w[i].abs*a[i]**2 magnitude_b += w[i].abs*b[i]**2 end - dot_product/Math.sqrt(magnitude_a*magnitude_b) + dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b)) end end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 156353a..e61543b 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -83,7 +83,7 @@ module OpenTox end R.assign "measurement", x R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='complete')" + R.eval "r <- cor(measurement,prediction,use='pairwise')" r = R.eval("r").to_ruby mae = mae/predictions.size @@ -99,11 +99,7 @@ module OpenTox } end - end - - module Plot - - def plot_id + def self.correlation_plot id, predictions tmpfile = "/tmp/#{id.to_s}_correlation.png" x = [] y = [] @@ -115,10 +111,11 @@ module OpenTox R.assign "prediction", y R.eval "all = c(measurement,prediction)" R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(prediction,measurement,main='',asp=1,xlim=range, ylim=range)" + # TODO units + R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") plot_id = $gridfs.insert_one(file) plot_id end -- cgit v1.2.3