author     Christoph Helma <helma@in-silico.ch>    2019-08-24 15:01:04 +0200
committer  Christoph Helma <helma@in-silico.ch>    2019-08-24 15:01:04 +0200
commit     a35be3d59a513701f8822af5b56510647d8d531c (patch)
tree       432efa0d6be991a2fc81fcc6f40337f5c77452b6
parent     1f789133d961c29d3babfaf69cdde3d675288537 (diff)
obsolete files removed
-rwxr-xr-x  bin/classification_crossvalidation.rb      4
-rwxr-xr-x  bin/classification_summary.rb              4
-rwxr-xr-x  bin/confusion_matrix.rb                    4
-rwxr-xr-x  bin/fingerprint_independent_variables.rb   4
-rwxr-xr-x  bin/scale_independent_variables.rb         4
-rw-r--r--  lib/algorithm.rb                           13
-rw-r--r--  lib/crossvalidation.rb                     117
-rw-r--r--  lib/enm-import.rb                          125
-rw-r--r--  lib/feature.rb                             120
-rw-r--r--  lib/lazar.rb                               22
-rw-r--r--  lib/leave-one-out-validation.rb            61
-rw-r--r--  lib/opentox.rb                             18
-rw-r--r--  lib/rest-client-wrapper.rb                 97
-rw-r--r--  lib/train-test-validation.rb               76
-rw-r--r--  lib/unique_descriptors.rb                  120
-rw-r--r--  lib/validation.rb                          26
16 files changed, 21 insertions, 794 deletions
diff --git a/bin/classification_crossvalidation.rb b/bin/classification_crossvalidation.rb
new file mode 100755
index 0000000..4bd6bc6
--- /dev/null
+++ b/bin/classification_crossvalidation.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+model = ClassificationModel.new ARGV[0]
+model.crossvalidation
diff --git a/bin/classification_summary.rb b/bin/classification_summary.rb
new file mode 100755
index 0000000..a3e4172
--- /dev/null
+++ b/bin/classification_summary.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+stat = ClassificationStatistics.new ARGV[0]
+stat.summary
diff --git a/bin/confusion_matrix.rb b/bin/confusion_matrix.rb
new file mode 100755
index 0000000..789262d
--- /dev/null
+++ b/bin/confusion_matrix.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+stat = ClassificationStatistics.new ARGV[0]
+stat.confusion_matrix
diff --git a/bin/fingerprint_independent_variables.rb b/bin/fingerprint_independent_variables.rb
new file mode 100755
index 0000000..7dea239
--- /dev/null
+++ b/bin/fingerprint_independent_variables.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+dataset = Dataset.new ARGV[0]
+dataset.fingerprint_independent_variables ARGV[0]
diff --git a/bin/scale_independent_variables.rb b/bin/scale_independent_variables.rb
new file mode 100755
index 0000000..1d7662a
--- /dev/null
+++ b/bin/scale_independent_variables.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+dataset = Dataset.new ARGV[0]
+dataset.scale_independent_variables ARGV[0]
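
The five new bin/ scripts above share one wrapper pattern: load the library, build an object from ARGV[0], call a single method. A minimal sketch of that pattern follows, with an argument guard added for illustration; what ARGV[0] has to contain (a model name, id or file path) is an assumption that depends on lib/model.rb, not something this commit specifies.

    #!/usr/bin/env ruby
    # Sketch of the shared command-line wrapper pattern (argument guard added).
    require_relative "../lib/lazar"

    abort "Usage: #{$0} MODEL" if ARGV[0].nil?
    model = ClassificationModel.new ARGV[0]   # argument semantics assumed, see lib/model.rb
    model.crossvalidation                     # run the crossvalidation
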
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
deleted file mode 100644
index f70ac1a..0000000
--- a/lib/algorithm.rb
+++ /dev/null
@@ -1,13 +0,0 @@
-module OpenTox
-
- module Algorithm
-
- # Execute an algorithm with parameters
- def self.run algorithm, parameters=nil
- klass,method = algorithm.split('.')
- Object.const_get(klass).send(method,parameters)
- end
-
- end
-end
-
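
The removed Algorithm.run helper dispatched a "Class.method" string via const_get and send. The following self-contained sketch reproduces that dispatch idiom outside OpenTox; the Echo class is a made-up placeholder, not part of lazar.

    # Stand-alone illustration of the dispatch idiom used by the deleted helper.
    class Echo
      def self.shout text
        text.to_s.upcase
      end
    end

    def run algorithm, parameters=nil
      klass, method = algorithm.split('.')          # "Echo.shout" -> ["Echo", "shout"]
      Object.const_get(klass).send(method, parameters)
    end

    puts run("Echo.shout", "hello")                 # => HELLO
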
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
deleted file mode 100644
index e1761bc..0000000
--- a/lib/crossvalidation.rb
+++ /dev/null
@@ -1,117 +0,0 @@
-module OpenTox
-
- module Validation
-
- # Crossvalidation
- class CrossValidation < Validation
- field :validation_ids, type: Array, default: []
- field :folds, type: Integer, default: 10
-
- # Create a crossvalidation
- # @param [OpenTox::Model::Lazar]
- # @param [Fixnum] number of folds
- # @return [OpenTox::Validation::CrossValidation]
- def self.create model, n=10
- $logger.debug model.algorithms
- klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
- klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
- raise ArgumentError, "Unknown model class #{model.class}." unless klass
-
- cv = klass.new(
- name: model.name,
- model_id: model.id,
- folds: n
- )
- cv.save # set created_at
-
- training_dataset = model.training_dataset
- training_dataset.folds(n).each_with_index do |fold,fold_nr|
- #fork do # parallel execution of validations can lead to Rserve and memory problems
- $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
- t = Time.now
- validation = TrainTest.create(model, fold[0], fold[1])
- cv.validation_ids << validation.id
- $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
- end
- cv.save
- cv.statistics
- cv.update_attributes(finished_at: Time.now)
- cv
- end
-
- # Get execution time
- # @return [Fixnum]
- def time
- finished_at - created_at
- end
-
- # Get individual validations
- # @return [Array<OpenTox::Validation>]
- def validations
- validation_ids.collect{|vid| TrainTest.find vid}
- end
-
- # Get predictions for all compounds
- # @return [Array<Hash>]
- def predictions
- predictions = {}
- validations.each{|v| predictions.merge!(v.predictions)}
- predictions
- end
- end
-
- # Crossvalidation of classification models
- class ClassificationCrossValidation < CrossValidation
- include ClassificationStatistics
- field :accept_values, type: Array
- field :confusion_matrix, type: Hash
- field :accuracy, type: Hash
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :nr_predictions, type: Hash
- field :probability_plot_id, type: BSON::ObjectId
- end
-
- # Crossvalidation of regression models
- class RegressionCrossValidation < CrossValidation
- include RegressionStatistics
- field :rmse, type: Hash
- field :mae, type: Hash
- field :r_squared, type: Hash
- field :within_prediction_interval, type: Hash
- field :out_of_prediction_interval, type: Hash
- field :nr_predictions, type: Hash
- field :warnings, type: Array
- field :correlation_plot_id, type: BSON::ObjectId
- end
-
- # Independent repeated crossvalidations
- class RepeatedCrossValidation < Validation
- field :crossvalidation_ids, type: Array, default: []
- field :correlation_plot_id, type: BSON::ObjectId
-
- # Create repeated crossvalidations
- # @param [OpenTox::Model::Lazar]
- # @param [Fixnum] number of folds
- # @param [Fixnum] number of repeats
- # @return [OpenTox::Validation::RepeatedCrossValidation]
- def self.create model, folds=10, repeats=5
- repeated_cross_validation = self.new
- repeats.times do |n|
- $logger.debug "Crossvalidation #{n+1} for #{model.name}"
- repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
- end
- repeated_cross_validation.save
- repeated_cross_validation
- end
-
- # Get crossvalidations
- # @return [OpenTox::Validation::CrossValidation]
- def crossvalidations
- crossvalidation_ids.collect{|id| CrossValidation.find(id)}
- end
-
- end
- end
-
-end
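
For reference, the removed crossvalidation classes were driven roughly as sketched below; model stands for an existing OpenTox::Model::Lazar instance and is assumed, not defined in this commit.

    # 10-fold crossvalidation of an existing lazar model (sketch).
    cv = OpenTox::Validation::CrossValidation.create model, 10
    puts "runtime: #{cv.time} s"           # finished_at - created_at
    cv.validations.each { |v| puts v.id }  # per-fold TrainTest validations
    predictions = cv.predictions           # merged predictions of all folds

    # Five independent repetitions of a 10-fold crossvalidation (sketch).
    rcv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
    rcv.crossvalidations.each { |c| puts c.name }
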
diff --git a/lib/enm-import.rb b/lib/enm-import.rb
deleted file mode 100644
index cf1a26f..0000000
--- a/lib/enm-import.rb
+++ /dev/null
@@ -1,125 +0,0 @@
-module OpenTox
-
- # Import data from external databases
- module Import
-
- class Enanomapper
- include OpenTox
-
- # Import from eNanoMapper
- def self.import
- # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
- datasets = {}
- bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle', {}, {accept: :json}))["dataset"]
- bundles.each do |bundle|
- datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip)
- $logger.debug bundle["title"].strip
- nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"], {}, {accept: :json}))["dataEntry"]
- nanoparticles.each_with_index do |np,n|
- core_id = nil
- coating_ids = []
- np["composition"].each do |c|
- uri = c["component"]["compound"]["URI"]
- data = JSON.parse(RestClientWrapper.get("https://data.enanomapper.net/query/compound/url/all?search=#{uri}", {}, {accept: :json}))
- source = data["dataEntry"][0]["compound"]["URI"]
- smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
- names = []
- names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
- names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
- if smiles
- compound = Compound.find_or_create_by(:smiles => smiles)
- compound.name = names.first
- compound.names = names.compact
- else
- compound = Compound.find_or_create_by(:name => names.first,:names => names.compact)
- end
- compound.source = source
- compound.save
- if c["relation"] == "HAS_CORE"
- core_id = compound.id.to_s
- elsif c["relation"] == "HAS_COATING"
- coating_ids << compound.id.to_s
- end
- end if np["composition"]
- nanoparticle = Nanoparticle.find_or_create_by(
- :name => np["values"]["https://data.enanomapper.net/identifier/name"],
- :source => np["compound"]["URI"],
- :core_id => core_id,
- :coating_ids => coating_ids
- )
- #np["bundles"].keys.each do |bundle_uri|
- #nanoparticle.dataset_ids << datasets[bundle_uri].id
- #end
-
- studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"]
- studies.each do |study|
- dataset = datasets[np["bundles"].keys.first]
- proteomics_features = {}
- category = study["protocol"]["topcategory"]
- source = study["protocol"]["category"]["term"]
- study["effects"].each do |effect|
-
- effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
- effect["conditions"].delete_if { |k, v| v.nil? }
-
- if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
-
- JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
- proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
- nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
- end
- else
- name = effect["endpoint"]
- unit = effect["result"]["unit"]
- warnings = []
- case name
- when "Log2 transformed" # use a sensible name
- name = "log2(Net cell association)"
- warnings = ["Original name was 'Log2 transformed'"]
- unit = "log2(mL/ug(Mg))"
- when "Total protein (BCA assay)"
- category = "P-CHEM"
- warnings = ["Category changed from TOX to P-CHEM"]
- end
- feature = klass.find_or_create_by(
- :name => name,
- :unit => unit,
- :category => category,
- :conditions => effect["conditions"],
- :source => study["protocol"]["category"]["term"],
- :measured => true,
- :warnings => warnings
- )
- nanoparticle.parse_ambit_value feature, effect["result"], dataset
- end
- end
- end
- nanoparticle.save
- print "#{n}, "
- end
- puts
- end
- datasets.each { |u,d| d.save }
- end
-
-=begin
- def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
- #get list of bundle URIs
- bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
- datasets = []
- bundles.each do |bundle|
- uri = bundle["URI"]
- study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
- study["@graph"].each do |i|
- puts i.to_yaml if i.keys.include? "sio:has-value"
- end
- end
- datasets.collect{|d| d.id}
- end
-=end
-
- end
-
- end
-
-end
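
The importer above rests on a single access pattern: GET a JSON document through RestClientWrapper (also removed in this commit) and create Mongoid documents with find_or_create_by. A trimmed sketch of that pattern, taken from the code above rather than a working replacement for it:

    # Fetch the eNanoMapper bundle list and register one Dataset per bundle (sketch).
    require "json"

    bundles = JSON.parse(
      RestClientWrapper.get("https://data.enanomapper.net/bundle", {}, {accept: :json})
    )["dataset"]

    bundles.each do |bundle|
      Dataset.find_or_create_by(:source => bundle["URI"], :name => bundle["title"].strip)
    end
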
diff --git a/lib/feature.rb b/lib/feature.rb
deleted file mode 100644
index 6f9d5c4..0000000
--- a/lib/feature.rb
+++ /dev/null
@@ -1,120 +0,0 @@
-module OpenTox
-
- # Original ID (e.g. from CSV input)
- class OriginalId < Feature
- field :dataset_id, type: BSON::ObjectId
- end
-
- # Original SMILES (e.g. from CSV input)
- class OriginalSmiles < Feature
- field :dataset_id, type: BSON::ObjectId
- end
-
- # Warnings
- class Warnings < Feature
- field :dataset_id, type: BSON::ObjectId
- end
-
- # Confidence
- class Confidence < Feature
- field :dataset_id, type: BSON::ObjectId
- def name
- "Confidence"
- end
- end
-
- # Categorical variables
- class NominalFeature < Feature
- field :accept_values, type: Array
- end
-
- # Quantitative variables
- class NumericFeature < Feature
- field :unit, type: String
- end
-
- # Nominal biological activity
- class NominalBioActivity < NominalFeature
- end
-
- # Numeric biological activity
- class NumericBioActivity < NumericFeature
- end
-
- # Merged nominal biological activity
- class MergedNominalBioActivity < NominalBioActivity
- field :original_feature_ids, type: Array
- field :transformations, type: Array
- end
-
- # Merged numeric biological activity
- class MergedNumericBioActivity < NumericBioActivity
- field :original_feature_ids, type: Array
- end
-
- # Transformed nominal biological activity
- class TransformedNominalBioActivity < NominalFeature
- field :original_feature_id, type: BSON::ObjectId
- field :transformation, type: Hash
- end
-
- # Transformed numeric biological activity
- class TransformedNumericBioActivity < NumericFeature
- field :original_feature_id, type: BSON::ObjectId
- field :transformation, type: String
- end
-
- # Nominal lazar prediction
- class NominalLazarPrediction < NominalFeature
- field :model_id, type: BSON::ObjectId
- field :training_feature_id, type: BSON::ObjectId
- def name
- "Prediction: #{self[:name]}"
- end
- end
-
- class LazarPredictionProbability < NominalLazarPrediction
- def name
- "Probability: #{self[:name]}"
- end
- end
-
- # Numeric lazar prediction
- class NumericLazarPrediction < NumericFeature
- field :model_id, type: BSON::ObjectId
- field :training_feature_id, type: BSON::ObjectId
- def name
- "Prediction: #{self[:name]}"
- end
- end
-
- class LazarPredictionInterval < NumericLazarPrediction
- def name
- "#{self[:name].capitalize} prediction interval"
- end
- end
-
- class NominalSubstanceProperty < NominalFeature
- end
-
- class NumericSubstanceProperty < NumericFeature
- field :category, type: String
- end
-
- class NanoParticleProperty < NumericSubstanceProperty
- field :conditions, type: Hash
- end
-
- # Feature for SMARTS fragments
- class Smarts < Feature
- field :smarts, type: String
- index "smarts" => 1
- # Create feature from SMARTS string
- # @param [String]
- # @return [OpenTox::Feature]
- def self.from_smarts smarts
- self.find_or_create_by :smarts => smarts
- end
- end
-
-end
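
The deleted feature hierarchy was used through Mongoid's find_or_create_by, which keeps feature creation idempotent. A hedged usage sketch; the SMARTS string and the activity name are arbitrary example values:

    # Create (or look up) a SMARTS fragment feature and a nominal activity feature.
    smarts_feature = OpenTox::Smarts.from_smarts "c1ccccc1"   # benzene ring fragment
    activity = OpenTox::NominalBioActivity.find_or_create_by(
      :name => "Mutagenicity",
      :accept_values => ["mutagenic", "non-mutagenic"]
    )
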
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 2cc1321..f8a2732 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -27,30 +27,10 @@ suppressPackageStartupMessages({
PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
=end
-[ # be aware of the require sequence as it affects class/method overwrites
- "array.rb",
-# "overwrite.rb",
-# "rest-client-wrapper.rb",
-# "opentox.rb",
-# "feature.rb",
-# "physchem.rb",
-# "substance.rb",
+[ "array.rb",
"compound.rb",
-# "nanoparticle.rb",
"dataset.rb",
-# "algorithm.rb",
"similarity.rb",
-# "feature_selection.rb",
"model.rb",
"statistics.rb",
-# "classification.rb",
-# "regression.rb",
-# "caret.rb",
-# "validation-statistics.rb",
-# "validation.rb",
-# "train-test-validation.rb",
-# "leave-one-out-validation.rb",
-# "crossvalidation.rb",
-# "download.rb",
-# "import.rb",
].each{ |f| require_relative f }
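
The comment kept in the require list above ("be aware of the require sequence as it affects class/method overwrites") refers to Ruby's open classes: whichever file is required last wins when two files define the same method. A small, generic illustration (Dataset#info is a made-up placeholder, not a lazar method):

    class Dataset
      def info; "first definition"; end
    end

    class Dataset                 # reopened, e.g. by a file required later
      def info; "second definition"; end
    end

    puts Dataset.new.info         # => second definition
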
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
deleted file mode 100644
index 7d73b89..0000000
--- a/lib/leave-one-out-validation.rb
+++ /dev/null
@@ -1,61 +0,0 @@
-module OpenTox
-
- module Validation
-
- # Leave one out validation
- class LeaveOneOut < Validation
-
- # Create a leave one out validation
- # @param [OpenTox::Model::Lazar]
- # @return [OpenTox::Validation::LeaveOneOut]
- def self.create model
- raise ArgumentError, "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
- $logger.debug "#{model.name}: LOO validation started"
- t = Time.now
- model.training_dataset.features.collect{|f| f.class}.include?(NominalBioActivity) ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut
- loo = klass.new :model_id => model.id
- predictions = model.predict model.training_dataset.substances
- predictions.each{|cid,p| p.delete(:neighbors)}
- predictions.each do |cid,prediction|
- prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) if prediction[:value]
- predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
- end
- predictions.select!{|cid,p| p[:value] and p[:measurements]}
- loo.predictions = predictions
- loo.statistics
- $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
- loo
- end
-
- end
-
- # Leave one out validation for classification models
- class ClassificationLeaveOneOut < LeaveOneOut
- include ClassificationStatistics
- field :accept_values, type: Array
- field :confusion_matrix, type: Hash
- field :weighted_confusion_matrix, type: Hash
- field :accuracy, type: Hash
- field :weighted_accuracy, type: Hash
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :nr_predictions, type: Hash
- field :probability_plot_id, type: BSON::ObjectId
- end
-
- # Leave one out validation for regression models
- class RegressionLeaveOneOut < LeaveOneOut
- include RegressionStatistics
- field :rmse, type: Hash
- field :mae, type: Hash
- field :r_squared, type: Hash
- field :within_prediction_interval, type: Hash
- field :out_of_prediction_interval, type: Hash
- field :nr_predictions, type: Hash
- field :warnings, type: Array
- field :correlation_plot_id, type: BSON::ObjectId
- end
-
- end
-
-end
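
The removed leave-one-out validation was invoked as sketched below; model is assumed to be a lazar model without supervised feature selection (otherwise create raises the ArgumentError shown above).

    loo = OpenTox::Validation::LeaveOneOut.create model
    loo.predictions.each do |compound_id, prediction|
      puts "#{compound_id}: #{prediction[:value]} (measured: #{prediction[:measurements]})"
    end
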
diff --git a/lib/opentox.rb b/lib/opentox.rb
deleted file mode 100644
index fb2a579..0000000
--- a/lib/opentox.rb
+++ /dev/null
@@ -1,18 +0,0 @@
-module OpenTox
-
- # create default OpenTox classes
- # provides Mongoid's query and persistence methods
- # http://mongoid.org/en/mongoid/docs/persistence.html
- # http://mongoid.org/en/mongoid/docs/querying.html
- CLASSES.each do |klass|
- c = Class.new do
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: klass.downcase.pluralize
- field :name, type: String
- end
- OpenTox.const_set klass,c
- end
-
-end
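
The deleted opentox.rb generated the core classes at load time with Class.new and const_set. The following self-contained sketch shows the same metaprogramming pattern without Mongoid, so it runs on plain Ruby; module and class names are illustrative only.

    module OpenToxSketch
      CLASSES = ["Compound", "Dataset"]
      CLASSES.each do |klass|
        c = Class.new do            # anonymous class ...
          attr_accessor :name
        end
        const_set klass, c          # ... registered under the given name
      end
    end

    d = OpenToxSketch::Dataset.new
    d.name = "training data"
    puts d.name
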
diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb
deleted file mode 100644
index db23e66..0000000
--- a/lib/rest-client-wrapper.rb
+++ /dev/null
@@ -1,97 +0,0 @@
-module OpenTox
-
- # Adjustments to the rest-client gem for OpenTox
- class RestClientWrapper
-
- attr_accessor :request, :response
-
- @@subjectid = nil
-
- def self.subjectid=(subjectid)
- @@subjectid = subjectid
- end
-
- def self.subjectid
- @@subjectid
- end
-
- # REST methods
- # Raises OpenTox::Error if call fails (rescued in overwrite.rb -> halt 502)
- # Does not wait for task to finish and returns task uri
- # @param [String] destination URI
- # @param [optional,Hash|String] Payload data posted to the service
- # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl
- # @return [RestClient::Response] REST call response
- [:head,:get,:post,:put,:delete].each do |method|
-
- define_singleton_method method do |uri,payload={},headers={},waiting_task=nil|
-
- uri = Addressable::URI.encode(uri)
- # check input
- raise ArgumentError, "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash)
- headers[:subjectid] ||= @@subjectid
- raise ArgumentError, "Invalid URI: '#{uri}'" unless URI.valid? uri
- # make sure that no header parameters are set in the payload
- [:accept,:content_type,:subjectid].each do |header|
- if defined? $aa || URI(uri).host == URI($aa[:uri]).host
- else
- raise ArgumentError, "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header]
- end
- end
-
- # create request
- args={}
- args[:method] = method
- args[:url] = uri
- args[:verify_ssl] = 0 if headers[:verify_ssl].nil? || headers[:verify_ssl].empty?
- args[:timeout] = 1800
- args[:payload] = payload
- headers.each{ |k,v| headers.delete(k) if v==nil } if headers #remove keys with empty values, as this can cause problems
- args[:headers] = headers
-
- $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post"
-
- @request = RestClient::Request.new(args)
- # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader)
- @response = @request.execute do |response, request, result|
- if [301, 302, 307].include? response.code and request.method == :get
- response.follow_redirection(request, result)
-=begin
- elsif response.code >= 400 and !URI.task?(uri)
- error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first
- begin # errors are returned as error reports in json, try to parse
- content = JSON.parse(response)
- msg = content["message"].to_s
- cause = content["errorCause"].to_s
- raise if msg.size==0 && cause.size==0 # parsing failed
- rescue # parsing error failed, use complete content as message
- msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}"
- cause = nil
- end
- Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method
-=end
- else
- response
- end
- end
- end
- end
-
-=begin
- #@return [Array] of hashes with error code, method and class
- def self.known_errors
- errors = []
- RestClient::STATUSES.each do |code,k|
- if code >= 400
- method = k.underscore.gsub(/ |'/,'_')
- method += "_error" unless method.match(/_error$/)
- klass = method.split("_").collect{|s| s.capitalize}.join("")
- errors << {:code => code, :method => method.to_sym, :class => klass}
- end
- end
- errors
- end
-=end
-
- end
-end
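
The verb methods of the removed wrapper were generated with define_singleton_method, so head/get/post/put/delete all share the (uri, payload, headers) signature. A usage sketch, reusing the eNanoMapper URI that appears in enm-import.rb above:

    require "json"

    response = OpenTox::RestClientWrapper.get(
      "https://data.enanomapper.net/bundle",   # destination URI
      {},                                      # empty payload
      {accept: :json}                          # headers (content negotiation)
    )
    bundles = JSON.parse(response)["dataset"]
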
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
deleted file mode 100644
index d034cd1..0000000
--- a/lib/train-test-validation.rb
+++ /dev/null
@@ -1,76 +0,0 @@
-module OpenTox
-
- module Validation
-
- # Training test set validation
- class TrainTest < Validation
-
- field :training_dataset_id, type: BSON::ObjectId
- field :test_dataset_id, type: BSON::ObjectId
-
- # Create a training test set validation
- # @param [OpenTox::Model::Lazar]
- # @param [OpenTox::Dataset] training dataset
- # @param [OpenTox::Dataset] test dataset
- # @return [OpenTox::Validation::TrainTest]
- def self.create model, training_set, test_set
-
- validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
- validation_model.save
- predictions = validation_model.predict test_set.substances
- predictions.each do |cid,prediction|
- prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) if prediction[:value]
- end
- predictions.select!{|cid,p| p[:value] and p[:measurements]}
- # remove neighbors to avoid mongos file size limit error on large datasets
- predictions.each{|cid,p| p.delete(:neighbors)} #if model.training_dataset.name.match(/mutagenicity/i)
- validation = self.new(
- :model_id => validation_model.id,
- :test_dataset_id => test_set.id,
- :predictions => predictions
- )
- validation.save
- validation
- end
-
- # Get test dataset
- # @return [OpenTox::Dataset]
- def test_dataset
- Dataset.find test_dataset_id
- end
-
- # Get training dataset
- # @return [OpenTox::Dataset]
- def training_dataset
- Dataset.find training_dataset_id
- end
-
- end
-
- # Training test set validation for classification models
- class ClassificationTrainTest < TrainTest
- include ClassificationStatistics
- field :accept_values, type: Array
- field :confusion_matrix, type: Array
- field :weighted_confusion_matrix, type: Array
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :probability_plot_id, type: BSON::ObjectId
- end
-
- # Training test set validation for regression models
- class RegressionTrainTest < TrainTest
- include RegressionStatistics
- field :rmse, type: Float, default:0
- field :mae, type: Float, default:0
- field :r_squared, type: Float
- field :within_prediction_interval, type: Integer, default:0
- field :out_of_prediction_interval, type: Integer, default:0
- field :correlation_plot_id, type: BSON::ObjectId
- end
-
- end
-
-end
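
The removed train/test validation was used roughly as follows; model, training_set and test_set are assumed to be a lazar model and two OpenTox::Dataset instances produced by an external split.

    validation = OpenTox::Validation::TrainTest.create model, training_set, test_set
    validation.predictions.each do |compound_id, p|
      puts "#{compound_id}: predicted #{p[:value]}, measured #{p[:measurements]}"
    end
    puts validation.test_dataset.name        # resolved via test_dataset_id
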
diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb
deleted file mode 100644
index fc10cd4..0000000
--- a/lib/unique_descriptors.rb
+++ /dev/null
@@ -1,120 +0,0 @@
-# set of non redundant descriptors, faster algorithms are preferred
-# TODO:
-# select logP algorithm
-# select l5 algorithm
-# use smarts matcher for atom counts
-# check correlations
-UNIQUEDESCRIPTORS = [
- "Openbabel.abonds", #Number of aromatic bonds
- "Openbabel.atoms", #Number of atoms
- "Openbabel.bonds", #Number of bonds
- "Openbabel.dbonds", #Number of double bonds
- "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
- "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
- "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
- #"Openbabe..L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!!
- "Openbabel.logP", #octanol/water partition coefficient
- "Openbabel.MP", #Melting point
- "Openbabel.MR", #molar refractivity
- "Openbabel.MW", #Molecular Weight filter
- "Openbabel.nF", #Number of Fluorine Atoms
- "Openbabel.sbonds", #Number of single bonds
- "Openbabel.tbonds", #Number of triple bonds
- "Openbabel.TPSA", #topological polar surface area
- "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and
- "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
- "Cdk.AcidicGroupCount", #Returns the number of acidic groups.
- #"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
- #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule.
- #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule.
- #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type.
- "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges
- "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight
- "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability
- "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al. .
- "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens).
- "Cdk.BasicGroupCount", #Returns the number of basic groups.
- #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order.
- "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information
- "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization
- "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6
- "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7
- "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6
- "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7
- "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information.
- "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework
- "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06}
- "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule.
- #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
- #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
- "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
- # TODO check why the next descriptor is not present in the CDK_DESCRIPTIONS variable.
- #"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
- "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
- "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
- "Cdk.LargestChain", #Returns the number of atoms in the largest chain
- "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain
- "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
- "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
- "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
- #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
- "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration.
- "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
- "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
- "Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule.
- #"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five.
- #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions .
- "Cdk.VABC", #Describes the volume of a molecule.
- "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule.
- "Cdk.WHIM", #Holistic descriptors described by Todeschini et al .
- #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight
- "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching.
- "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number.
- "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP.
- "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms.
- "Joelib.count.NumberOfS", #no description available
- "Joelib.count.NumberOfP", #no description available
- "Joelib.count.NumberOfO", #no description available
- "Joelib.count.NumberOfN", #no description available
- #"Joeli#.count.AromaticBonds", #no description available
- "Joelib.count.NumberOfI", #no description available
- "Joelib.count.NumberOfF", #no description available
- "Joelib.count.NumberOfC", #no description available
- "Joelib.count.NumberOfB", #no description available
- "Joelib.count.HydrophobicGroups", #no description available
- #"Joelib.KierShape3", #no description available
- #"Joelib.KierShape2", #no description available
- #"Joelib.KierShape1", #no description available
- #"Joelib.count.AcidicGroups", #no description available
- "Joelib.count.AliphaticOHGroups", #no description available
- #"Joelib.count.NumberOfAtoms", #no description available
- "Joelib.TopologicalRadius", #no description available
- "Joelib.GeometricalShapeCoefficient", #no description available
- #"Joelib.MolecularWeight", #no description available
- "Joelib.FractionRotatableBonds", #no description available
- #"Joeli..count.HBD2", #no description available
- #"Joelib.count.HBD1", #no description available
- "Joelib.LogP", #no description available
- "Joelib.GraphShapeCoefficient", #no description available
- "Joelib.count.BasicGroups", #no description available
- #"Joelib.count.RotatableBonds", #no description available
- "Joelib.count.HeavyBonds", #no description available
- "Joelib.PolarSurfaceArea", #no description available
- #"Joelib.ZagrebIndex1", #no description available
- "Joelib.GeometricalRadius", #no description available
- "Joelib.count.SO2Groups", #no description available
- "Joelib.count.AromaticOHGroups", #no description available
- "Joelib.GeometricalDiameter", #no description available
- #"Joelib.MolarRefractivity", #no description available
- "Joelib.count.NumberOfCl", #no description available
- "Joelib.count.OSOGroups", #no description available
- "Joelib.count.NumberOfBr", #no description available
- "Joelib.count.NO2Groups", #no description available
- "Joelib.count.HeteroCycles", #no description available
- #"Joelib.count.HBA2", #no description available
- #"Joelib.count.HBA1", #no description available
- #"Joelib.count.NumberOfBonds", #no description available
- "Joelib.count.SOGroups", #no description available
- "Joelib.TopologicalDiameter", #no description available
- "Joelib.count.NumberOfHal", #no description available
-]
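
Every entry in the removed UNIQUEDESCRIPTORS list encodes the calculation library and the descriptor name as "Library.descriptor". A small sketch of splitting the list by library; the three entries are samples taken from the list above:

    descriptors = ["Openbabel.logP", "Cdk.ALOGP", "Joelib.LogP"]   # stand-in for the full list
    by_library  = descriptors.group_by { |name| name.split(".", 2).first }
    # => {"Openbabel"=>["Openbabel.logP"], "Cdk"=>["Cdk.ALOGP"], "Joelib"=>["Joelib.LogP"]}
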
diff --git a/lib/validation.rb b/lib/validation.rb
deleted file mode 100644
index 9402361..0000000
--- a/lib/validation.rb
+++ /dev/null
@@ -1,26 +0,0 @@
-module OpenTox
-
- module Validation
-
- # Base validation class
- class Validation
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: "validations"
- field :name, type: String
- field :model_id, type: BSON::ObjectId
- field :predictions, type: Hash, default: {}
- field :finished_at, type: Time
-
- # Get model
- # @return [OpenTox::Model::Lazar]
- def model
- Model::Lazar.find model_id
- end
-
- end
-
- end
-
-end
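
The deleted base class stored every validation as a Mongoid document with its model_id and predictions hash, and resolved the model on demand. A hedged usage sketch; some_validation_id is a placeholder:

    validation = OpenTox::Validation::Validation.find some_validation_id
    puts validation.name
    puts validation.model.name           # Model::Lazar.find(model_id) under the hood
    puts validation.predictions.size
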