nanoparticle models fixed

author: Christoph Helma <helma@in-silico.ch> 2016-05-09 15:11:46 +0200
committer: Christoph Helma <helma@in-silico.ch> 2016-05-09 15:11:46 +0200
commit: 611bac891177f8d9185d45486dd574b6ef4d1912 (patch)
tree: 4ebb62998deee6aa02f4a8b94c69bac226c27c27
parent: 7794086d367fb256c3673d7578b23ec2fb83e6ed (diff)
10 files changed, 72 insertions, 152 deletions
diff --git a/data/enm-dump.rb b/data/enm-dump.rb
deleted file mode 100644
index 88667fc..0000000
--- a/data/enm-dump.rb
+++ /dev/null
@@ -1,17 +0,0 @@
-require 'json'
-
-#get list of bundle URIs
-`wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
-json = JSON.parse File.read('./bundles.json')
-json["dataset"].each do |dataset|
-  uri = dataset["URI"]
-  id = uri.split("/").last
-  #`wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
-  `wget --header='accept:application/ld+json' '#{uri}/substance' -O 'study#{id}.json'`
-  #`wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
-  #`wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
-  #`wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
-  #`wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
-  #`wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
-  #`wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
-end
diff --git a/data/enm-import.rb b/data/enm-import.rb
deleted file mode 100644
index 37bc22b..0000000
--- a/data/enm-import.rb
+++ /dev/null
@@ -1,47 +0,0 @@
-require_relative '../lib/lazar.rb'
-include OpenTox
-$mongo.database.drop
-$gridfs = $mongo.database.fs
-
-#get list of bundle URIs
-bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
-bundles.each do |bundle|
-  uri = bundle["URI"]
-  nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
-  features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"]
-  nanoparticles.each do |np|
-      nanoparticle = Nanoparticle.find_or_create_by(
-        :name => np["values"]["https://data.enanomapper.net/identifier/name"],
-        :source => np["compound"]["URI"],
-      )
-      nanoparticle.bundles << uri
-      nanoparticle.bundles.uniq!
-      np["composition"].each do |comp|
-        case comp["relation"]
-        when "HAS_CORE"
-          nanoparticle.core = comp["component"]["compound"]["URI"]
-        when "HAS_COATING"
-          nanoparticle.coating << comp["component"]["compound"]["URI"]
-        end
-      end if np["composition"]
-      np["values"].each do |u,v|
-        if u.match(/property/)
-          name, unit, source = nil
-          features.each do |uri,feat|
-            if u.match(/#{uri}/)
-              name = feat["title"]
-              unit = feat["units"]
-              source = uri
-            end
-          end
-          feature = Feature.find_or_create_by(
-            :name => name,
-            :unit => unit,
-            :source => source
-          )
-        end
-        v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array
-      end
-      nanoparticle.save!
-  end
-end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 86800c6..9738c1f 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -62,12 +62,12 @@ module OpenTox
         training_cids = training_idxs.collect{|i| substance_ids[i]}
         chunk = [training_cids,test_cids].collect do |cids|
           dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id )
-          dataset.compounds.each do |compound|
-            compound.dataset_ids << dataset.id
-            compound.toxicities.each do |feature_id,data|
+          dataset.substances.each do |substance|
+            substance.dataset_ids << dataset.id
+            substance.toxicities.each do |feature_id,data|
               data[dataset.id.to_s] = data[self.id.to_s] # copy data entries
             end
-            compound.save
+            substance.save
           end
           dataset
         end
diff --git a/lib/import.rb b/lib/import.rb
index 11cb367..dfe5e2d 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -40,10 +40,10 @@ module OpenTox
             datasets[bundle_uri].substance_ids << nanoparticle.id
             nanoparticle["dataset_ids"] << datasets[bundle_uri].id
           end
+          bundle = datasets[np["bundles"].keys.first].id if np["bundles"].size == 1
           study["effects"].each do |effect|
             effect["result"]["textValue"] ?  klass = NominalFeature : klass = NumericFeature
             # TODO parse core/coating
-            # TODO parse proteomics, they come as a large textValue
             #$logger.debug File.join(np["compound"]["URI"],"study")
             effect["conditions"].delete_if { |k, v| v.nil? }
             # parse proteomics data
@@ -53,7 +53,7 @@ module OpenTox
                   :name => identifier,
                   :category => "Proteomics",
                 )
-                nanoparticle.parse_ambit_value feature, value
+                nanoparticle.parse_ambit_value feature, value, bundle
               end
             else
               feature = klass.find_or_create_by(
@@ -62,7 +62,7 @@ module OpenTox
                 :category => study["protocol"]["topcategory"],
                 :conditions => effect["conditions"]
               )
-              nanoparticle.parse_ambit_value feature, effect["result"]
+              nanoparticle.parse_ambit_value feature, effect["result"], bundle
             end
           end
           nanoparticle.save
diff --git a/lib/model.rb b/lib/model.rb
index 5b094fb..070248a 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -76,6 +76,7 @@ module OpenTox
         prediction = {}
         if neighbors.collect{|n| n["_id"]}.include? compound.id
 
+          me = neighbors.select{|n| n["_id"] == compound.id}.first
           database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq
           prediction[:database_activities] = database_activities
           prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 9bf419d..b79981d 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -9,10 +9,14 @@ module OpenTox
     field :proteomics, type: Hash, default: {}
 
     def nanoparticle_neighbors params
-      Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np}
+      # Neighbors: nanoparticles of the training dataset that have toxicities; each gets "tanimoto" = 1.
+      Dataset.find(params[:training_dataset_id]).nanoparticles.collect do |np|
+        np["tanimoto"] = 1
+        np unless np.toxicities.empty?
+      end.compact
     end
 
-    def add_feature feature, value
+    def add_feature feature, value, dataset_id
       case feature.category
       when "P-CHEM"
         physchem_descriptors[feature.id.to_s] ||= []
@@ -23,51 +27,52 @@ module OpenTox
         proteomics[feature.id.to_s] << value
         proteomics[feature.id.to_s].uniq!
       when "TOX"
-        toxicities[feature.id.to_s] ||= []
+        toxicities[feature.id.to_s] ||= {}
+        toxicities[feature.id.to_s][dataset_id.to_s] ||= []
         # TODO generic way of parsing TOX values
         if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" 
-          toxicities[feature.id.to_s] << -Math.log10(value)
+          toxicities[feature.id.to_s][dataset_id.to_s] << -Math.log10(value)
         else
-          toxicities[feature.id.to_s] << value
+          toxicities[feature.id.to_s][dataset_id.to_s] << value
         end
-        toxicities[feature.id.to_s].uniq!
+        toxicities[feature.id.to_s][dataset_id.to_s].uniq!
       else
         warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
       end
     end
 
-    def parse_ambit_value feature, v
+    def parse_ambit_value feature, v, dataset_id
       v.delete "unit"
       # TODO: ppm instead of weights
       if v.keys == ["textValue"]
-        add_feature feature, v["textValue"]
+        add_feature feature, v["textValue"], dataset_id
       elsif v.keys == ["loValue"]
-        add_feature feature, v["loValue"]
+        add_feature feature, v["loValue"], dataset_id
       elsif v.keys.size == 2 and v["errorValue"]
-        add_feature feature, v["loValue"]
+        add_feature feature, v["loValue"], dataset_id
         warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
       elsif v.keys.size == 2 and v["loQualifier"] == "mean"
-        add_feature feature, v["loValue"]
+        add_feature feature, v["loValue"], dataset_id
         warn "'#{feature.name}' is a mean value. Original data is not available."
       elsif v.keys.size == 2 and v["loQualifier"] #== ">="
         warn "Only min value available for '#{feature.name}', entry ignored"
       elsif v.keys.size == 2 and v["upQualifier"] #== ">="
         warn "Only max value available for '#{feature.name}', entry ignored"
       elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
-        add_feature feature, v["loValue"]
+        add_feature feature, v["loValue"], dataset_id
         warn "loQualifier and upQualifier are empty."
       elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == ""
-        add_feature feature, v["loValue"]
+        add_feature feature, v["loValue"], dataset_id
         warn "loQualifier and upQualifier are empty."
       elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
-        add_feature feature, v["loValue"]
+        add_feature feature, v["loValue"], dataset_id
         warn "loQualifier and upQualifier are empty."
       elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"]
-        add_feature feature, [v["loValue"],v["upValue"]].mean
+        add_feature feature, [v["loValue"],v["upValue"]].mean, dataset_id
         warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
       elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"]
         warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
-        add_feature feature, v["loValue"]
+        add_feature feature, v["loValue"], dataset_id
       elsif v == {} # do nothing
       else
         warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
diff --git a/lib/regression.rb b/lib/regression.rb
index b8a7e5f..691f903 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -10,7 +10,7 @@ module OpenTox
         neighbors.each do |row|
           sim = row["tanimoto"]
           sim ||= 1 # TODO: sim f nanoparticles
-          if row["toxicities"][params[:prediction_feature_id].to_s]
+          if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]
             row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
               weighted_sum += sim*act
               sim_sum += sim
diff --git a/scripts/import-enm.rb b/scripts/import-enm.rb
deleted file mode 100755
index 4fb414b..0000000
--- a/scripts/import-enm.rb
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env ruby
-require_relative '../lib/lazar'
-include OpenTox
-$mongo.database.drop
-$gridfs = $mongo.database.fs # recreate GridFS indexes
-Import::Enanomapper.import
-`mongodump -h 127.0.0.1 -d production`
diff --git a/scripts/mirror-enm2test.rb b/scripts/mirror-enm2test.rb
new file mode 100755
index 0000000..f6638bc
--- /dev/null
+++ b/scripts/mirror-enm2test.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative '../lib/lazar'
+include OpenTox
+Import::Enanomapper.mirror File.join(File.dirname(__FILE__),"..","test","data","enm")
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 6d91103..2082ec4 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -4,66 +4,12 @@ require_relative "setup.rb"
 class NanoparticleTest  < MiniTest::Test
 
   def setup
+    Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
     #`mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}`
   end
 
-  def test_mirror
-    Import::Enanomapper.mirror File.join(File.dirname(__FILE__),"..","data")
-  end
-
-  def test_import
-    Import::Enanomapper.import File.join(File.dirname(__FILE__),"..","data")
-#    skip
-#    dataset_ids = Import::Enanomapper.import
-#    assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
-#    assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
-#    assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki")
-#    assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
-#    p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
-#    dataset_ids.collect do |d|
-#      d = Dataset.find(d)
-      #p d.name
-      #puts d.to_csv
-#    end
-  end
-
-  def test_summaries
-    skip
-    features = Feature.all.to_a
-    #p features.collect do |f|
-      #f if f.category == "TOX"
-    #end.to_a.flatten.size
-    toxcounts = {}
-    pccounts = {}
-    Nanoparticle.all.each do |np|
-      np.toxicities.each do |t,v|
-        toxcounts[t] ||= 0
-        toxcounts[t] += 1#v.uniq.size
-      end
-      np.physchem_descriptors.each do |t,v|
-        pccounts[t] ||= 0
-        pccounts[t] += 1#v.uniq.size
-      end
-    end
-    #puts counts.keys.collect{|i| Feature.find(i)}.to_yaml
-    #pccounts.each{|e,n| p Feature.find(e),n if n > 100}
-    #p toxcounts.collect{|e,n| Feature.find(e).name if n > 1}.uniq
-    toxcounts.each{|e,n| p Feature.find(e),n if n > 100}
-  end
-
-
-  def test_import_ld
-    skip
-    dataset_ids = Import::Enanomapper.import_ld
-  end
-
-  def test_export
-    Dataset.all.each do |d|
-      puts d.to_csv
-    end
-  end
-
   def test_create_model_with_feature_selection
+    skip
     training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
     feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)")
     model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
@@ -80,7 +26,6 @@ class NanoparticleTest  < MiniTest::Test
     feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)")
     model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
     nanoparticle = training_dataset.nanoparticles[-34]
-    #p nanoparticle.neighbors
     prediction = model.predict nanoparticle
     p prediction
     #p prediction
@@ -97,4 +42,40 @@ class NanoparticleTest  < MiniTest::Test
     p cv
   end
 
+  def test_export
+    skip
+    Dataset.all.each do |d|
+      puts d.to_csv
+    end
+  end
+
+  def test_summaries
+    skip
+    features = Feature.all.to_a
+    #p features.collect do |f|
+      #f if f.category == "TOX"
+    #end.to_a.flatten.size
+    toxcounts = {}
+    pccounts = {}
+    Nanoparticle.all.each do |np|
+      np.toxicities.each do |t,v|
+        toxcounts[t] ||= 0
+        toxcounts[t] += 1#v.uniq.size
+      end
+      np.physchem_descriptors.each do |t,v|
+        pccounts[t] ||= 0
+        pccounts[t] += 1#v.uniq.size
+      end
+    end
+    #puts counts.keys.collect{|i| Feature.find(i)}.to_yaml
+    #pccounts.each{|e,n| p Feature.find(e),n if n > 100}
+    #p toxcounts.collect{|e,n| Feature.find(e).name if n > 1}.uniq
+    toxcounts.each{|e,n| p Feature.find(e),n if n > 100}
+  end
+
+
+  def test_import_ld
+    skip
+    dataset_ids = Import::Enanomapper.import_ld
+  end
 end
author	Christoph Helma <helma@in-silico.ch>	2016-05-09 15:11:46 +0200
committer	Christoph Helma <helma@in-silico.ch>	2016-05-09 15:11:46 +0200
commit	611bac891177f8d9185d45486dd574b6ef4d1912 (patch)
tree	4ebb62998deee6aa02f4a8b94c69bac226c27c27
parent	7794086d367fb256c3673d7578b23ec2fb83e6ed (diff)