From cfc64a2966ab38698e499f0b44f41208ee77a07f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 26 Apr 2016 17:38:15 +0200 Subject: first nanomaterial prediction --- data/enm-dump.rb | 15 ++++---- lib/import.rb | 18 +++++++++- lib/model.rb | 1 + lib/nanoparticle.rb | 2 ++ lib/overwrite.rb | 9 +++++ lib/regression.rb | 99 +++++++++++++++++++++++++++++++++++---------------- test/nanoparticles.rb | 40 ++++++++++++++++++--- test/setup.rb | 4 +-- test/validation.rb | 2 ++ 9 files changed, 146 insertions(+), 44 deletions(-) diff --git a/data/enm-dump.rb b/data/enm-dump.rb index c1c25e7..88667fc 100644 --- a/data/enm-dump.rb +++ b/data/enm-dump.rb @@ -6,11 +6,12 @@ json = JSON.parse File.read('./bundles.json') json["dataset"].each do |dataset| uri = dataset["URI"] id = uri.split("/").last - `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` - `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` - `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` - `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` - `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` - `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` - `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` + #`wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` + `wget --header='accept:application/ld+json' '#{uri}/substance' -O 'study#{id}.json'` + #`wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` + #`wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` + #`wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` + #`wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` + #`wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` + #`wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` end diff --git a/lib/import.rb b/lib/import.rb index 9091207..3c1edfe 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -30,7 +30,7 @@ module OpenTox $logger.debug File.join(np["compound"]["URI"],"study") effect["conditions"].delete_if { |k, v| v.nil? } feature = klass.find_or_create_by( - :source => File.join(np["compound"]["URI"],"study"), + #:source => File.join(np["compound"]["URI"],"study"), :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}", :unit => effect["result"]["unit"], :category => study["protocol"]["topcategory"], @@ -48,6 +48,22 @@ module OpenTox datasets.collect{|d| d.id} end +=begin + def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries + #get list of bundle URIs + bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + datasets = [] + bundles.each do |bundle| + uri = bundle["URI"] + study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`) + study["@graph"].each do |i| + puts i.to_yaml if i.keys.include? "sio:has-value" + end + end + datasets.collect{|d| d.id} + end +=end + def self.dump #get list of bundle URIs `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` diff --git a/lib/model.rb b/lib/model.rb index b82f098..45054e2 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -50,6 +50,7 @@ module OpenTox end def predict_compound compound + #p compound neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature # check for database activities (neighbors may include query compound) diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index b934bb3..b5de5b9 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -16,9 +16,11 @@ module OpenTox when "P-CHEM" physchem_descriptors[feature.id.to_s] ||= [] physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! when "TOX" toxicities[feature.id.to_s] ||= [] toxicities[feature.id.to_s] << value + toxicities[feature.id.to_s].uniq! else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end diff --git a/lib/overwrite.rb b/lib/overwrite.rb index cef5758..4a79051 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -114,6 +114,15 @@ class Array Math.sqrt(self.sample_variance) end + def for_R + if self.first.is_a?(String) + #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets + "NA" + else + self.median + end + end + end module URI diff --git a/lib/regression.rb b/lib/regression.rb index cb17f25..5610a77 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -75,46 +75,62 @@ module OpenTox end - def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4" + def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" + + neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities - neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] - physchem = {} + pc_ids = neighbors.collect{|n| n.physchem_descriptors.keys}.flatten.uniq + data_frame = [] + data_frame[0] = [] neighbors.each_with_index do |n,i| - if n["toxicities"][params[:prediction_feature_id].to_s] - n["toxicities"][params[:prediction_feature_id].to_s].each do |act| - # TODO fix!!!! - activities << -Math.log10(act) - #if act.numeric? - #activities << act - n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? - neighbor = Substance.find(n["_id"]) - neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity - physchem[pid] ||= [] - physchem[pid] += v - end + neighbor = Substance.find(n["_id"]) + n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + data_frame[0][i] = act + n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + neighbor.physchem_descriptors.each do |pid,values| + values.uniq! + warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1 + j = pc_ids.index(pid)+1 + data_frame[j] ||= [] + data_frame[j][i] = values.for_R end end + (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + data_frame[j] ||= [] + data_frame[j][i] ||= "NA" + end end - - # remove properties with a single value - physchem.each do |pid,v| - physchem.delete(pid) if v.uniq.size <= 1 + remove_idx = [] + data_frame.each_with_index do |r,i| + remove_idx << i if r.uniq.size == 1 # remove properties with a single value + end + remove_idx.reverse.each do |i| + data_frame.delete_at i + pc_ids.delete_at i end - if physchem.empty? + if pc_ids.empty? result = local_weighted_average(compound, params) result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result - else - data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }} - prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]} + query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R} + remove_idx = [] + query_descriptors.each_with_index do |v,i| + remove_idx << i if v == "NA" + end + remove_idx.reverse.each do |i| + data_frame.delete_at i + pc_ids.delete_at i + query_descriptors.delete_at i + end + prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." @@ -130,16 +146,39 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" - #p r_data_frame - File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"} +rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) + File.open("tmp.R","w+"){|f| + f.puts "suppressPackageStartupMessages({ + library(iterators,lib=\"#{rlib}\") + library(foreach,lib=\"#{rlib}\") + library(ggplot2,lib=\"#{rlib}\") + library(grid,lib=\"#{rlib}\") + library(gridExtra,lib=\"#{rlib}\") + library(pls,lib=\"#{rlib}\") + library(caret,lib=\"#{rlib}\") + library(doMC,lib=\"#{rlib}\") + registerDoMC(#{NR_CORES}) +})" + + f.puts "data <- #{r_data_frame}\n" + f.puts "weights <- c(#{training_weights.join(', ')})" + f.puts "features <- c(#{training_features.join(', ')})" + f.puts "names(data) <- append(c('activities'),features)" # + f.puts "model <- train(activities ~ ., data = data, method = '#{method}')" + f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + f.puts "names(fingerprint) <- features" + f.puts "prediction <- predict(model,fingerprint)" + } + R.eval "data <- #{r_data_frame}" R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # - begin - R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" - rescue - return nil - end + #begin + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" + #rescue + #return nil + #end + p query_feature_values R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 46073a9..31bb903 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -11,9 +11,38 @@ class NanoparticleTest < MiniTest::Test p dataset_ids.collect{|d| {d => Dataset.find(d).name}} dataset_ids.collect do |d| d = Dataset.find(d) - p d.name - puts d.to_csv + #p d.name + #puts d.to_csv + end + end + + def test_summaries + features = Feature.all.to_a + #p features.collect do |f| + #f if f.category == "TOX" + #end.to_a.flatten.size + toxcounts = {} + pccounts = {} + Nanoparticle.all.each do |np| + np.toxicities.each do |t,v| + toxcounts[t] ||= 0 + toxcounts[t] += 1#v.uniq.size + end + np.physchem_descriptors.each do |t,v| + pccounts[t] ||= 0 + pccounts[t] += 1#v.uniq.size + end end + #puts counts.keys.collect{|i| Feature.find(i)}.to_yaml + #pccounts.each{|e,n| p Feature.find(e),n if n > 100} + #p toxcounts.collect{|e,n| Feature.find(e).name if n > 1}.uniq + toxcounts.each{|e,n| p Feature.find(e),n if n > 100} + end + + + def test_import_ld + skip + dataset_ids = Import::Enanomapper.import_ld end def test_export @@ -24,11 +53,14 @@ class NanoparticleTest < MiniTest::Test def test_create_model training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors") + feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)") + model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"}) nanoparticle = training_dataset.nanoparticles[-34] + #p nanoparticle.neighbors prediction = model.predict nanoparticle p prediction - refute_nil prediction[:value] + #p prediction + #refute_nil prediction[:value] end end diff --git a/test/setup.rb b/test/setup.rb index e7c32b4..6c97282 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb' include OpenTox TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") -$mongo.database.drop -$gridfs = $mongo.database.fs +#$mongo.database.drop +#$gridfs = $mongo.database.fs diff --git a/test/validation.rb b/test/validation.rb index baee2d1..cbc7d09 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -9,6 +9,7 @@ class ValidationTest < MiniTest::Test model = Model::LazarClassification.create dataset.features.first, dataset cv = ClassificationCrossValidation.create model assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split" + assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than accuracy (#{cv.accuracy})." end def test_default_regression_crossvalidation @@ -85,6 +86,7 @@ class ValidationTest < MiniTest::Test assert_equal 14, loo.nr_unpredicted refute_empty loo.confusion_matrix assert loo.accuracy > 0.77 + assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})." end def test_regression_loo_validation -- cgit v1.2.3