From 753fcc204d93d86c76860bee6e2f7d0468c3c940 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 14 Apr 2016 19:43:24 +0200 Subject: features/toxicities fixed --- .gitignore | 1 + lib/classification.rb | 2 +- lib/compound.rb | 6 ++---- lib/dataset.rb | 29 +++++++++++++++++++++-------- lib/model.rb | 35 ++++++++++++++++------------------- lib/nanoparticle.rb | 30 +++++++++++++++++++----------- lib/opentox.rb | 5 +++++ lib/regression.rb | 35 ++++++++++++++++++++--------------- lib/substance.rb | 1 + test/classification.rb | 14 +++++++------- test/nanoparticles.rb | 23 ++++++++++++++++++----- test/setup.rb | 4 ++-- 12 files changed, 113 insertions(+), 72 deletions(-) diff --git a/.gitignore b/.gitignore index fb51df7..6e0f374 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +R openbabel Gemfile.lock *.gem diff --git a/lib/classification.rb b/lib/classification.rb index 0202940..4a17546 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -10,7 +10,7 @@ module OpenTox confidence = 0.0 neighbors.each do |row| sim = row["tanimoto"] - row["features"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| weighted_sum[act] ||= 0 weighted_sum[act] += sim end diff --git a/lib/compound.rb b/lib/compound.rb index 7895619..55cd482 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -17,8 +17,6 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer - # TODO separate between physchem, bio and tox - field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) @@ -291,7 +289,7 @@ module OpenTox candidate_fingerprint = compound.fingerprint params[:type] sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f feature_values = training_dataset.values(compound,prediction_feature) - neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] + neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end @@ -332,7 +330,7 @@ module OpenTox 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, '_id' => 1, - 'features' => 1, + 'toxicities' => 1, 'dataset_ids' => 1 }}, {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, diff --git a/lib/dataset.rb b/lib/dataset.rb index 25307c9..274c475 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -13,6 +13,10 @@ module OpenTox substances.select{|s| s.is_a? Compound} end + def nanoparticles + substances.select{|s| s.is_a? Nanoparticle} + end + # Get all substances def substances @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id} @@ -21,7 +25,7 @@ module OpenTox # Get all features def features - @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.compact.collect{|id| OpenTox::Feature.find(id)}.compact @features end @@ -98,13 +102,22 @@ module OpenTox # @return [String] def to_csv(inchi=false) CSV.generate() do |csv| - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + compound = Substance.find(data_entries.first.first).is_a? Compound + if compound + csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + else + csv << ["Name"] + features.collect{|f| f.name} + end data_entries.each do |sid,f| - substance = Substance.find cid + substance = Substance.find sid features.each do |feature| - f[feature.id].each do |v| - csv << [inchi ? substance.inchi : substance.smiles , v] - end + f[feature.id.to_s].each do |v| + if compound + csv << [inchi ? substance.inchi : substance.smiles , v] + else + csv << [substance.name , v] + end + end if f[feature.id.to_s] end end end @@ -221,8 +234,8 @@ module OpenTox self.data_entries[compound.id.to_s] ||= {} self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] self.data_entries[compound.id.to_s][@features[j].id.to_s] << v - compound.features[@features[j].id.to_s] ||= [] - compound.features[@features[j].id.to_s] << v + compound.toxicities[@features[j].id.to_s] ||= [] + compound.toxicities[@features[j].id.to_s] << v compound.save end end diff --git a/lib/model.rb b/lib/model.rb index 5140d5a..1960c10 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -36,6 +36,7 @@ module OpenTox super params # TODO document convention + #p training_dataset.features prediction_feature = training_dataset.features.first # set defaults for empty parameters self.prediction_feature_id ||= prediction_feature.id @@ -56,12 +57,13 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq + #TODO restrict to dataset features + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end - neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] } + neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) else @@ -78,12 +80,11 @@ module OpenTox # parse data compounds = [] - case object.class.to_s - when "OpenTox::Compound" + if object.is_a? Substance compounds = [object] - when "Array" + elsif object.is_a? Array compounds = object - when "OpenTox::Dataset" + elsif object.is_a? Dataset compounds = object.compounds else bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." @@ -97,30 +98,26 @@ module OpenTox end # serialize result - case object.class.to_s - when "OpenTox::Compound" + if object.is_a? Substance prediction = predictions[compounds.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity + return prediction + elsif object.is_a? Array return predictions - when "Array" - return predictions - when "OpenTox::Dataset" + elsif object.is_a? Dataset predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) - prediction_dataset = LazarPrediction.new( + prediction_dataset = LazarPrediction.create( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, - :prediction_feature_id => prediction_feature.id - + :prediction_feature_id => prediction_feature.id, + :predictions => predictions ) - compounds.each_with_index do |c,i| - prediction_dataset.predictions[c.id.to_s] = predictions[i] - end - prediction_dataset.save + #prediction_dataset.save return prediction_dataset end @@ -264,7 +261,7 @@ module OpenTox training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq query_features = nanoparticle.physchem_descriptors.keys common_features = (training_features & query_features) - p common_features + #p common_features end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 6e9b0ea..0350363 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -5,12 +5,10 @@ module OpenTox field :core, type: String field :coating, type: Array, default: [] - - field :toxicities, type: Hash, default: {} - #field :features, type: Hash, default: {} field :bundles, type: Array, default: [] - def predict + def nanoparticle_neighbors params + Dataset.find(params[:training_dataset_id]).nanoparticles end def add_feature feature, value @@ -21,22 +19,32 @@ module OpenTox toxicities[feature.id.to_s] ||= [] toxicities[feature.id.to_s] << value else - $logger.warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted." - warnings << "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted." + warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted." end end def parse_ambit_value feature, v + # TODO: units, mmol/log10 conversion if v.keys == ["loValue"] - add_feature feature, v["loValue"] + #if v["loValue"].numeric? + add_feature feature, v["loValue"] + #else + #warn "'#{v["loValue"]}' is not a numeric value, entry ignored." + #end elsif v.keys.size == 2 and v["loQualifier"] == "mean" - add_feature feature, {:mean => v["loValue"]} + #add_feature feature, {:mean => v["loValue"]} + add_feature feature, v["loValue"] + warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" - add_feature feature, {:min => v["loValue"],:max => Float::INFINITY} + #add_feature feature, {:min => v["loValue"],:max => Float::INFINITY} + warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" - add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY} + #add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY} + warn "Only max value available for '#{feature.name}', entry ignored" elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] - add_feature feature, {:min => v["loValue"],:max => v["upValue"]} + #add_feature feature, {:min => v["loValue"],:max => v["upValue"]} + add_feature feature, [v["loValue"],v["upValue"]].mean + warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." elsif v == {} # do nothing else $logger.warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." diff --git a/lib/opentox.rb b/lib/opentox.rb index cc18cc6..7d8a8a2 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -15,6 +15,11 @@ module OpenTox field :name, type: String field :source, type: String field :warnings, type: Array, default: [] + + def warn warning + $logger.warn warning + warnings << warning + end end OpenTox.const_set klass,c end diff --git a/lib/regression.rb b/lib/regression.rb index 5021fb3..cb17f25 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -9,8 +9,8 @@ module OpenTox neighbors = params[:neighbors] neighbors.each do |row| sim = row["tanimoto"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| + if row["toxicities"][params[:prediction_feature_id].to_s] + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) sim_sum += sim end @@ -32,8 +32,8 @@ module OpenTox neighbors.each_with_index do |row,i| neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| + if row["toxicities"][params[:prediction_feature_id].to_s] + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| activities << Math.log10(act) weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| @@ -79,21 +79,24 @@ module OpenTox neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] physchem = {} - neighbors.each_with_index do |row,i| - neighbor = Compound.find row["_id"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) - weights << row["tanimoto"] # TODO cosine ? - neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + neighbors.each_with_index do |n,i| + if n["toxicities"][params[:prediction_feature_id].to_s] + n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + # TODO fix!!!! + activities << -Math.log10(act) + #if act.numeric? + #activities << act + n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + neighbor = Substance.find(n["_id"]) + neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity physchem[pid] ||= [] - physchem[pid] << v + physchem[pid] += v end end end @@ -110,8 +113,8 @@ module OpenTox return result else - data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } - prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }} + prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]} if prediction.nil? prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." @@ -127,6 +130,8 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + #p r_data_frame + File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"} R.eval "data <- #{r_data_frame}" R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # diff --git a/lib/substance.rb b/lib/substance.rb index 6768ce7..82ca65d 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -2,6 +2,7 @@ module OpenTox class Substance field :physchem_descriptors, type: Hash, default: {} + field :toxicities, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end diff --git a/test/classification.rb b/test/classification.rb index af23db6..7412714 100644 --- a/test/classification.rb +++ b/test/classification.rb @@ -30,14 +30,14 @@ class LazarClassificationTest < MiniTest::Test # make a dataset prediction compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv") - prediction = model.predict compound_dataset - assert_equal compound_dataset.compounds, prediction.compounds + prediction_dataset = model.predict compound_dataset + assert_equal compound_dataset.compounds, prediction_dataset.compounds - cid = prediction.compounds[7].id.to_s - assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.predictions[cid][:warning] - cid = prediction.compounds[9].id.to_s - assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.predictions[cid][:warning] + cid = prediction_dataset.compounds[7].id.to_s + assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning] + cid = prediction_dataset.compounds[9].id.to_s + assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction_dataset.predictions[cid][:warning] # cleanup - [training_dataset,model,compound_dataset].each{|o| o.delete} + [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete} end end diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 6f241ec..46073a9 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -6,16 +6,29 @@ class NanoparticleTest < MiniTest::Test dataset_ids = Import::Enanomapper.import assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported" assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported" - p dataset_ids.collect{|d| Dataset.find(d).name} assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki") assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") + p dataset_ids.collect{|d| {d => Dataset.find(d).name}} + dataset_ids.collect do |d| + d = Dataset.find(d) + p d.name + puts d.to_csv + end end - def test_create_model - Model::NanoLazar.create_all.each do |model| - np = Nanoparticle.find(model.training_particle_ids.sample) - model.predict np + def test_export + Dataset.all.each do |d| + puts d.to_csv end end + def test_create_model + training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") + model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors") + nanoparticle = training_dataset.nanoparticles[-34] + prediction = model.predict nanoparticle + p prediction + refute_nil prediction[:value] + end + end diff --git a/test/setup.rb b/test/setup.rb index e7c32b4..6c97282 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb' include OpenTox TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") -$mongo.database.drop -$gridfs = $mongo.database.fs +#$mongo.database.drop +#$gridfs = $mongo.database.fs -- cgit v1.2.3