From b6116bc4705066da30668ff3370f3b1c307e44e7 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 11 Nov 2016 13:07:53 +0100
Subject: enm import fixed

---
 lazar.gemspec                          |   1 -
 lib/import.rb                          | 194 ++++++++++++++-------------------
 lib/model.rb                           |  21 +---
 test/descriptor.rb                     |   1 -
 test/model-nanoparticle.rb             |   1 -
 test/nanomaterial-prediction-models.rb |   1 -
 test/setup.rb                          |   4 +
 test/validation-nanoparticle.rb        |  43 ++++----
 test/validation-regression.rb          |   1 -
 9 files changed, 106 insertions(+), 161 deletions(-)

diff --git a/lazar.gemspec b/lazar.gemspec
index a805edb..dfdaac8 100644
--- a/lazar.gemspec
+++ b/lazar.gemspec
@@ -24,5 +24,4 @@ Gem::Specification.new do |s|
   s.add_runtime_dependency 'rserve-client', '~> 0.3'
   s.add_runtime_dependency 'mongoid', '~> 5.0'
   s.add_runtime_dependency 'openbabel', '~> 2.3', '>= 2.3.2.2'
-
 end
diff --git a/lib/import.rb b/lib/import.rb
index 8f640b1..aa2ee75 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -5,129 +5,95 @@ module OpenTox
     class Enanomapper
       include OpenTox
 
-      def self.mirror dir=nil
-        # clean download dir
-        dir ||= File.join(File.dirname(__FILE__),"..","data","enm")
-        FileUtils.rm_rf dir
-        FileUtils.mkdir_p dir
-
-        #get list of bundle URIs
+      # Performance note: JSON parsing is the time-critical step (>99%); switching to the Oj parser brings only minor speed gains (~1%)
+      def self.import dir="."
+        datasets = {}
         bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
-        File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)}
-        # bundles
-          # id/summary
-          # id/compound
-          # id/substance
-          # id/property
-
         bundles.each do |bundle|
+          datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
           $logger.debug bundle["title"]
           nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
-          $logger.debug nanoparticles.size
-          nanoparticles.each do |nanoparticle|
-            uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"]
-            $logger.debug uuid
-            File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)}
-            studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"]
-            $logger.debug uuid if studies.size < 1 
-            studies.each do |study|
-              File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)}
-            end
-          end
-        end
-      end
-
-      def self.import dir="."
-        start_time = Time.now
-        t1 = 0
-        t2 = 0
-        datasets = {}
-        JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle|
-          if bundle["id"] == 3
-          datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
-          end
-        end
-        # TODO this is only for protein corona
-        Dir[File.join(dir,"study-F*.json")].each do |s|
-          t = Time.now
-          study = JSON.parse(File.read(s))
-          np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json")))
-          core_id = nil
-          coating_ids = []
-          np["composition"].each do |c|
-            uri = c["component"]["compound"]["URI"]
-            uri = CGI.escape File.join(uri,"&media=application/json")
-            data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
-            smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
-            names = []
-            names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
-            names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
-            if smiles
-              compound = Compound.find_or_create_by(:smiles => smiles)
-              compound.names = names.compact
-            else
-              compound = Compound.find_or_create_by(:names => names)
-            end
-            compound.save
-            if c["relation"] == "HAS_CORE"
-              core_id = compound.id.to_s
-            elsif c["relation"] == "HAS_COATING"
-              coating_ids << compound.id.to_s
+          nanoparticles.each_with_index do |np,n|
+            core_id = nil
+            coating_ids = []
+            np["composition"].each do |c|
+              uri = c["component"]["compound"]["URI"]
+              uri = CGI.escape File.join(uri,"&media=application/json")
+              data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
+              smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
+              names = []
+              names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
+              names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
+              if smiles
+                compound = Compound.find_or_create_by(:smiles => smiles)
+                compound.name = names.first
+                compound.names = names.compact
+              else
+                compound = Compound.find_or_create_by(:name => names.first,:names => names)
+              end
+              compound.save
+              if c["relation"] == "HAS_CORE"
+                core_id = compound.id.to_s
+              elsif c["relation"] == "HAS_COATING"
+                coating_ids << compound.id.to_s
+              end
+            end if np["composition"]
+            nanoparticle = Nanoparticle.find_or_create_by(
+              :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+              :source => np["compound"]["URI"],
+              :core_id => core_id,
+              :coating_ids => coating_ids
+            )
+            np["bundles"].keys.each do |bundle_uri|
+              nanoparticle.dataset_ids << datasets[bundle_uri].id
             end
-          end if np["composition"]
-          nanoparticle = Nanoparticle.find_or_create_by(
-            :name => np["values"]["https://data.enanomapper.net/identifier/name"],
-            :source => np["compound"]["URI"],
-            :core_id => core_id,
-            :coating_ids => coating_ids
-          )
-          np["bundles"].keys.each do |bundle_uri|
-            nanoparticle.dataset_ids << datasets[bundle_uri].id
-          end
 
-          dataset = datasets[np["bundles"].keys.first]
-          proteomics_features = {}
-          category = study["protocol"]["topcategory"]
-          source = study["protocol"]["category"]["term"]
-
-          study["effects"].each do |effect|
-
-            effect["result"]["textValue"] ?  klass = NominalFeature : klass = NumericFeature
-            effect["conditions"].delete_if { |k, v| v.nil? }
-
-            if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
-
-              JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
-                proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
-                nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
-              end
-            else
-              name = effect["endpoint"]
-              unit = effect["result"]["unit"]
-              warnings = []
-              case name
-              when "Log2 transformed" # use a sensible name
-                name = "log2(Net cell association)"
-                warnings = ["Original name was 'Log2 transformed'"]
-                unit = "log2(mL/ug(Mg))"
-              when "Total protein (BCA assay)"
-                category = "P-CHEM"
-                warnings = ["Category changed from TOX to P-CHEM"]
+            studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
+            studies.each do |study|
+              dataset = datasets[np["bundles"].keys.first]
+              proteomics_features = {}
+              category = study["protocol"]["topcategory"]
+              source = study["protocol"]["category"]["term"]
+              study["effects"].each do |effect|
+
+                effect["result"]["textValue"] ?  klass = NominalFeature : klass = NumericFeature
+                effect["conditions"].delete_if { |k, v| v.nil? }
+
+                if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
+
+                  JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
+                    proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
+                    nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
+                  end
+                else
+                  name = effect["endpoint"]
+                  unit = effect["result"]["unit"]
+                  warnings = []
+                  case name
+                  when "Log2 transformed" # use a sensible name
+                    name = "log2(Net cell association)"
+                    warnings = ["Original name was 'Log2 transformed'"]
+                    unit = "log2(mL/ug(Mg))"
+                  when "Total protein (BCA assay)"
+                    category = "P-CHEM"
+                    warnings = ["Category changed from TOX to P-CHEM"]
+                  end
+                  feature = klass.find_or_create_by(
+                    :name => name,
+                    :unit => unit,
+                    :category => category,
+                    :conditions => effect["conditions"],
+                    :source => study["protocol"]["category"]["term"],
+                    :measured => true,
+                    :warnings => warnings
+                  )
+                  nanoparticle.parse_ambit_value feature, effect["result"], dataset
+                end
               end
-              feature = klass.find_or_create_by(
-                :name => name,
-                :unit => unit,
-                :category => category,
-                :conditions => effect["conditions"],
-                :source => study["protocol"]["category"]["term"],
-                :measured => true,
-                :warnings => warnings
-              )
-              nanoparticle.parse_ambit_value feature, effect["result"], dataset
             end
+            nanoparticle.save
+            print "#{n}, "
           end
-    p nanoparticle
-          nanoparticle.save
         end
         datasets.each { |u,d| d.save }
       end
diff --git a/lib/model.rb b/lib/model.rb
index 809dc48..9be0fa0 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -152,10 +152,7 @@ module OpenTox
           categories.each do |category|
             Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
           end
-          #p feature_ids
-          #properties = Nanoparticle.all.collect { |s| p s.name; p s.id; p s.properties }
           properties = model.substances.collect { |s| s.properties  }
-          #p properties
           property_ids = properties.collect{|p| p.keys}.flatten.uniq
           model.descriptor_ids = feature_ids & property_ids
           model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
@@ -223,10 +220,10 @@ module OpenTox
             prediction[:measurements] << dependent_variables[i]
             prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
           else
-            next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core
             if fingerprints?
               neighbor_descriptors = fingerprints[i]
             else
+              next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
               neighbor_descriptors = scaled_variables.collect{|v| v[i]}
             end
             sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
@@ -344,7 +341,6 @@ module OpenTox
       field :unit, type: String
       field :model_id, type: BSON::ObjectId
       field :repeated_crossvalidation_id, type: BSON::ObjectId
-      #field :leave_one_out_validation_id, type: BSON::ObjectId
 
       def predict object
         model.predict object
@@ -370,10 +366,6 @@ module OpenTox
         repeated_crossvalidation.crossvalidations
       end
 
-      def leave_one_out_validation
-        Validation::LeaveOneOut.find leave_one_out_validation_id
-      end
-
       def regression?
         model.is_a? LazarRegression
       end
@@ -390,7 +382,6 @@ module OpenTox
         model = Lazar.create training_dataset: training_dataset
         prediction_model[:model_id] = model.id
         prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id
-        #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id
         prediction_model.save
         prediction_model
       end
@@ -406,12 +397,7 @@ module OpenTox
         unless training_dataset # try to import from json dump
           Import::Enanomapper.import
           training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-          unless training_dataset
-            Import::Enanomapper.mirror
-            Import::Enanomapper.import
-            training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-            bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
-          end
+          bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
         end
         prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
 
@@ -424,8 +410,7 @@ module OpenTox
         model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms)
         prediction_model[:model_id] = model.id
         repeated_cv = Validation::RepeatedCrossValidation.create model
-        prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id
-        #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id
+        prediction_model[:repeated_crossvalidation_id] = repeated_cv.id
         prediction_model.save
         prediction_model
       end
diff --git a/test/descriptor.rb b/test/descriptor.rb
index 6eb4316..563cdce 100644
--- a/test/descriptor.rb
+++ b/test/descriptor.rb
@@ -6,7 +6,6 @@ class DescriptorTest < MiniTest::Test
     # check available descriptors
     assert_equal 15,PhysChem.openbabel_descriptors.size,"incorrect number of Openbabel descriptors"
     assert_equal 45,PhysChem.joelib_descriptors.size,"incorrect number of Joelib descriptors"
-    p PhysChem.cdk_descriptors
     assert_equal 286,PhysChem.cdk_descriptors.size,"incorrect number of Cdk descriptors"
     assert_equal 346,PhysChem.descriptors.size,"incorrect number of physchem descriptors"
   end
diff --git a/test/model-nanoparticle.rb b/test/model-nanoparticle.rb
index c5f3223..7fb944e 100644
--- a/test/model-nanoparticle.rb
+++ b/test/model-nanoparticle.rb
@@ -108,7 +108,6 @@ class NanoparticleModelTest  < MiniTest::Test
       },
     }
     model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
-    p model
     refute_empty model.dependent_variables
     refute_empty model.descriptor_ids
     refute_empty model.independent_variables
diff --git a/test/nanomaterial-prediction-models.rb b/test/nanomaterial-prediction-models.rb
index b0c05f3..f90a822 100644
--- a/test/nanomaterial-prediction-models.rb
+++ b/test/nanomaterial-prediction-models.rb
@@ -13,7 +13,6 @@ class NanomaterialPredictionModelTest < MiniTest::Test
 
   def test_default_nanomaterial_prediction_model
     prediction_model = Model::NanoPrediction.create
-    p prediction_model
     [:endpoint,:species,:source].each do |p|
       refute_empty prediction_model[p]
     end
diff --git a/test/setup.rb b/test/setup.rb
index 6c97282..63b59fb 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,5 +5,9 @@ require_relative '../lib/lazar.rb'
 include OpenTox
 TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
 DATA_DIR ||= File.join(TEST_DIR,"data")
+training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+unless training_dataset
+  Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
+end
 #$mongo.database.drop
 #$gridfs = $mongo.database.fs
diff --git a/test/validation-nanoparticle.rb b/test/validation-nanoparticle.rb
index 5ed70f2..9351e1b 100644
--- a/test/validation-nanoparticle.rb
+++ b/test/validation-nanoparticle.rb
@@ -5,74 +5,72 @@ class NanoparticleValidationTest  < MiniTest::Test
 
   def setup
     @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-    unless @training_dataset
-      Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
-      @training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-    end
     @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
   end
 
   def test_validate_default_nanoparticle_model
     model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
     cv = CrossValidation.create model
-    p cv
-    p cv.rmse
-    p cv.r_squared
     #File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
     refute_nil cv.r_squared
     refute_nil cv.rmse
   end
 
-  def test_validate_pls_nanoparticle_model
+  def test_validate_pls_pchem_model
     algorithms = {
       :descriptors => {
         :method => "properties",
         :categories => ["P-CHEM"]
       },
       :prediction => {:method => 'Algorithm::Caret.pls' },
+      :feature_selection => {
+        :method => "Algorithm::FeatureSelection.correlation_filter",
+      },
     }
     model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
     assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
     cv = CrossValidation.create model
-    p cv.rmse
-    p cv.r_squared
     refute_nil cv.r_squared
     refute_nil cv.rmse
   end
 
-  def test_validate_proteomics_pls_nanoparticle_model
+=begin
+  def test_validate_proteomics_pls_pchem_model
     algorithms = {
       :descriptors => {
         :method => "properties",
         :categories => ["Proteomics"]
       },
       :prediction => {:method => 'Algorithm::Caret.pls' },
+      :feature_selection => {
+        :method => "Algorithm::FeatureSelection.correlation_filter",
+      },
     }
     model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
     assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
     cv = CrossValidation.create model
-    p cv.rmse
-    p cv.r_squared
     refute_nil cv.r_squared
     refute_nil cv.rmse
   end
+=end
 
-  def test_validate_all_default_nanoparticle_model
+  def test_validate_proteomics_pchem_default_model
     algorithms = {
       :descriptors => {
         :method => "properties",
         :categories => ["Proteomics","P-CHEM"]
       },
+      :feature_selection => {
+        :method => "Algorithm::FeatureSelection.correlation_filter",
+      },
     }
     model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
     cv = CrossValidation.create model
-    p cv.rmse
-    p cv.r_squared
     refute_nil cv.r_squared
     refute_nil cv.rmse
   end
 
-  def test_nanoparticle_fingerprint_model
+  def test_nanoparticle_fingerprint_model_without_feature_selection
     algorithms = {
       :descriptors => {
         :method => "fingerprint",
@@ -86,13 +84,11 @@ class NanoparticleValidationTest  < MiniTest::Test
     }
     model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
     cv = CrossValidation.create model
-    p cv.rmse
-    p cv.r_squared
     refute_nil cv.r_squared
     refute_nil cv.rmse
   end
 
-  def test_nanoparticle_fingerprint_weighted_average_model
+  def test_nanoparticle_fingerprint_weighted_average_model_without_feature_selection
     algorithms = {
       :descriptors => {
         :method => "fingerprint",
@@ -107,8 +103,6 @@ class NanoparticleValidationTest  < MiniTest::Test
     }
     model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
     cv = CrossValidation.create model
-    p cv.rmse
-    p cv.r_squared
     refute_nil cv.r_squared
     refute_nil cv.rmse
   end
@@ -123,11 +117,12 @@ class NanoparticleValidationTest  < MiniTest::Test
         :method => "Algorithm::Similarity.tanimoto",
         :min => 0.1
       },
+      :feature_selection => {
+        :method => "Algorithm::FeatureSelection.correlation_filter",
+      },
     }
     model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
     cv = CrossValidation.create model
-    p cv.rmse
-    p cv.r_squared
     refute_nil cv.r_squared
     refute_nil cv.rmse
   end
diff --git a/test/validation-regression.rb b/test/validation-regression.rb
index a0895f9..7630521 100644
--- a/test/validation-regression.rb
+++ b/test/validation-regression.rb
@@ -86,7 +86,6 @@ class ValidationRegressionTest < MiniTest::Test
       #assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
       #assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
     end
-    p repeated_cv
     File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot}
   end
 
-- 
cgit v1.2.3