author     Christoph Helma <helma@in-silico.ch>    2019-08-24 15:01:04 +0200
committer  Christoph Helma <helma@in-silico.ch>    2019-08-24 15:01:04 +0200
commit     a35be3d59a513701f8822af5b56510647d8d531c (patch)
tree       432efa0d6be991a2fc81fcc6f40337f5c77452b6
parent     1f789133d961c29d3babfaf69cdde3d675288537 (diff)
obsolete files removed
-rwxr-xr-x  bin/classification_crossvalidation.rb      4
-rwxr-xr-x  bin/classification_summary.rb              4
-rwxr-xr-x  bin/confusion_matrix.rb                    4
-rwxr-xr-x  bin/fingerprint_independent_variables.rb   4
-rwxr-xr-x  bin/scale_independent_variables.rb         4
-rw-r--r--  lib/algorithm.rb                           13
-rw-r--r--  lib/crossvalidation.rb                     117
-rw-r--r--  lib/enm-import.rb                          125
-rw-r--r--  lib/feature.rb                             120
-rw-r--r--  lib/lazar.rb                               22
-rw-r--r--  lib/leave-one-out-validation.rb            61
-rw-r--r--  lib/opentox.rb                             18
-rw-r--r--  lib/rest-client-wrapper.rb                 97
-rw-r--r--  lib/train-test-validation.rb               76
-rw-r--r--  lib/unique_descriptors.rb                  120
-rw-r--r--  lib/validation.rb                          26
16 files changed, 21 insertions, 794 deletions
diff --git a/bin/classification_crossvalidation.rb b/bin/classification_crossvalidation.rb
new file mode 100755
index 0000000..4bd6bc6
--- /dev/null
+++ b/bin/classification_crossvalidation.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+model = ClassificationModel.new ARGV[0]
+model.crossvalidation
diff --git a/bin/classification_summary.rb b/bin/classification_summary.rb
new file mode 100755
index 0000000..a3e4172
--- /dev/null
+++ b/bin/classification_summary.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+stat = ClassificationStatistics.new ARGV[0]
+stat.summary
diff --git a/bin/confusion_matrix.rb b/bin/confusion_matrix.rb
new file mode 100755
index 0000000..789262d
--- /dev/null
+++ b/bin/confusion_matrix.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+stat = ClassificationStatistics.new ARGV[0]
+stat.confusion_matrix
diff --git a/bin/fingerprint_independent_variables.rb b/bin/fingerprint_independent_variables.rb
new file mode 100755
index 0000000..7dea239
--- /dev/null
+++ b/bin/fingerprint_independent_variables.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+dataset = Dataset.new ARGV[0]
+dataset.fingerprint_independent_variables ARGV[0]
diff --git a/bin/scale_independent_variables.rb b/bin/scale_independent_variables.rb
new file mode 100755
index 0000000..1d7662a
--- /dev/null
+++ b/bin/scale_independent_variables.rb
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+require_relative "../lib/lazar"
+dataset = Dataset.new ARGV[0]
+dataset.scale_independent_variables ARGV[0]
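
The five new bin/ scripts above share one wrapper pattern: load the library, build an object from ARGV[0], call a single method. A minimal sketch of that pattern follows, with an argument guard added for illustration; what ARGV[0] has to contain (a model name, id or file path) is an assumption that depends on lib/model.rb, not something this commit specifies.

    #!/usr/bin/env ruby
    # Sketch of the shared command-line wrapper pattern (argument guard added).
    require_relative "../lib/lazar"

    abort "Usage: #{$0} MODEL" if ARGV[0].nil?
    model = ClassificationModel.new ARGV[0]   # argument semantics assumed, see lib/model.rb
    model.crossvalidation                     # run the crossvalidation
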
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
deleted file mode 100644
index f70ac1a..0000000
--- a/lib/algorithm.rb
+++ /dev/null
@@ -1,13 +0,0 @@
-module OpenTox
-
- module Algorithm
-
- # Execute an algorithm with parameters
- def self.run algorithm, parameters=nil
- klass,method = algorithm.split('.')
- Object.const_get(klass).send(method,parameters)
- end
-
- end
-end
-
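
The removed Algorithm.run helper dispatched a "Class.method" string via const_get and send. The following self-contained sketch reproduces that dispatch idiom outside OpenTox; the Echo class is a made-up placeholder, not part of lazar.

    # Stand-alone illustration of the dispatch idiom used by the deleted helper.
    class Echo
      def self.shout text
        text.to_s.upcase
      end
    end

    def run algorithm, parameters=nil
      klass, method = algorithm.split('.')          # "Echo.shout" -> ["Echo", "shout"]
      Object.const_get(klass).send(method, parameters)
    end

    puts run("Echo.shout", "hello")                 # => HELLO
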
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
deleted file mode 100644
index e1761bc..0000000
--- a/lib/crossvalidation.rb
+++ /dev/null
@@ -1,117 +0,0 @@
-module OpenTox
-
- module Validation
-
- # Crossvalidation
- class CrossValidation < Validation
- field :validation_ids, type: Array, default: []
- field :folds, type: Integer, default: 10
-
- # Create a crossvalidation
- # @param [OpenTox::Model::Lazar]
- # @param [Fixnum] number of folds
- # @return [OpenTox::Validation::CrossValidation]
- def self.create model, n=10
- $logger.debug model.algorithms
- klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
- klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
- raise ArgumentError, "Unknown model class #{model.class}." unless klass
-
- cv = klass.new(
- name: model.name,
- model_id: model.id,
- folds: n
- )
- cv.save # set created_at
-
- training_dataset = model.training_dataset
- training_dataset.folds(n).each_with_index do |fold,fold_nr|
- #fork do # parallel execution of validations can lead to Rserve and memory problems
- $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
- t = Time.now
- validation = TrainTest.create(model, fold[0], fold[1])
- cv.validation_ids << validation.id
- $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
- end
- cv.save
- cv.statistics
- cv.update_attributes(finished_at: Time.now)
- cv
- end
-
- # Get execution time
- # @return [Fixnum]
- def time
- finished_at - created_at
- end
-
- # Get individual validations
- # @return [Array<OpenTox::Validation>]
- def validations
- validation_ids.collect{|vid| TrainTest.find vid}
- end
-
- # Get predictions for all compounds
- # @return [Array<Hash>]
- def predictions
- predictions = {}
- validations.each{|v| predictions.merge!(v.predictions)}
- predictions
- end
- end
-
- # Crossvalidation of classification models
- class ClassificationCrossValidation < CrossValidation
- include ClassificationStatistics
- field :accept_values, type: Array
- field :confusion_matrix, type: Hash
- field :accuracy, type: Hash
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :nr_predictions, type: Hash
- field :probability_plot_id, type: BSON::ObjectId
- end
-
- # Crossvalidation of regression models
- class RegressionCrossValidation < CrossValidation
- include RegressionStatistics
- field :rmse, type: Hash
- field :mae, type: Hash
- field :r_squared, type: Hash
- field :within_prediction_interval, type: Hash
- field :out_of_prediction_interval, type: Hash
- field :nr_predictions, type: Hash
- field :warnings, type: Array
- field :correlation_plot_id, type: BSON::ObjectId
- end
-
- # Independent repeated crossvalidations
- class RepeatedCrossValidation < Validation
- field :crossvalidation_ids, type: Array, default: []
- field :correlation_plot_id, type: BSON::ObjectId
-
- # Create repeated crossvalidations
- # @param [OpenTox::Model::Lazar]
- # @param [Fixnum] number of folds
- # @param [Fixnum] number of repeats
- # @return [OpenTox::Validation::RepeatedCrossValidation]
- def self.create model, folds=10, repeats=5
- repeated_cross_validation = self.new
- repeats.times do |n|
- $logger.debug "Crossvalidation #{n+1} for #{model.name}"
- repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
- end
- repeated_cross_validation.save
- repeated_cross_validation
- end
-
- # Get crossvalidations
- # @return [OpenTox::Validation::CrossValidation]
- def crossvalidations
- crossvalidation_ids.collect{|id| CrossValidation.find(id)}
- end
-
- end
- end
-
-end
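
For reference, the removed crossvalidation classes were driven roughly as sketched below; model stands for an existing OpenTox::Model::Lazar instance and is assumed, not defined in this commit.

    # 10-fold crossvalidation of an existing lazar model (sketch).
    cv = OpenTox::Validation::CrossValidation.create model, 10
    puts "runtime: #{cv.time} s"           # finished_at - created_at
    cv.validations.each { |v| puts v.id }  # per-fold TrainTest validations
    predictions = cv.predictions           # merged predictions of all folds

    # Five independent repetitions of a 10-fold crossvalidation (sketch).
    rcv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
    rcv.crossvalidations.each { |c| puts c.name }
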
diff --git a/lib/enm-import.rb b/lib/enm-import.rb
deleted file mode 100644
index cf1a26f..0000000
--- a/lib/enm-import.rb
+++ /dev/null
@@ -1,125 +0,0 @@
-module OpenTox
-
- # Import data from external databases
- module Import
-
- class Enanomapper
- include OpenTox
-
- # Import from eNanoMapper
- def self.import
- # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
- datasets = {}
- bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle', {}, {accept: :json}))["dataset"]
- bundles.each do |bundle|
- datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip)
- $logger.debug bundle["title"].strip
- nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"], {}, {accept: :json}))["dataEntry"]
- nanoparticles.each_with_index do |np,n|
- core_id = nil
- coating_ids = []
- np["composition"].each do |c|
- uri = c["component"]["compound"]["URI"]
- data = JSON.parse(RestClientWrapper.get("https://data.enanomapper.net/query/compound/url/all?search=#{uri}", {}, {accept: :json}))
- source = data["dataEntry"][0]["compound"]["URI"]
- smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
- names = []
- names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
- names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
- if smiles
- compound = Compound.find_or_create_by(:smiles => smiles)
- compound.name = names.first
- compound.names = names.compact
- else
- compound = Compound.find_or_create_by(:name => names.first,:names => names.compact)
- end
- compound.source = source
- compound.save
- if c["relation"] == "HAS_CORE"
- core_id = compound.id.to_s
- elsif c["relation"] == "HAS_COATING"
- coating_ids << compound.id.to_s
- end
- end if np["composition"]
- nanoparticle = Nanoparticle.find_or_create_by(
- :name => np["values"]["https://data.enanomapper.net/identifier/name"],
- :source => np["compound"]["URI"],
- :core_id => core_id,
- :coating_ids => coating_ids
- )
- #np["bundles"].keys.each do |bundle_uri|
- #nanoparticle.dataset_ids << datasets[bundle_uri].id
- #end
-
- studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"]
- studies.each do |study|
- dataset = datasets[np["bundles"].keys.first]
- proteomics_features = {}
- category = study["protocol"]["topcategory"]
- source = study["protocol"]["category"]["term"]
- study["effects"].each do |effect|
-
- effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
- effect["conditions"].delete_if { |k, v| v.nil? }
-
- if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
-
- JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
- proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
- nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
- end
- else
- name = effect["endpoint"]
- unit = effect["result"]["unit"]
- warnings = []
- case name
- when "Log2 transformed" # use a sensible name
- name = "log2(Net cell association)"
- warnings = ["Original name was 'Log2 transformed'"]
- unit = "log2(mL/ug(Mg))"
- when "Total protein (BCA assay)"
- category = "P-CHEM"
- warnings = ["Category changed from TOX to P-CHEM"]
- end
- feature = klass.find_or_create_by(
- :name => name,
- :unit => unit,
- :category => category,
- :conditions => effect["conditions"],
- :source => study["protocol"]["category"]["term"],
- :measured => true,
- :warnings => warnings
- )
- nanoparticle.parse_ambit_value feature, effect["result"], dataset
- end
- end
- end
- nanoparticle.save
- print "#{n}, "
- end
- puts
- end
- datasets.each { |u,d| d.save }
- end
-
-=begin
- def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
- #get list of bundle URIs
- bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
- datasets = []
- bundles.each do |bundle|
- uri = bundle["URI"]
- study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
- study["@graph"].each do |i|
- puts i.to_yaml if i.keys.include? "sio:has-value"
- end
- end
- datasets.collect{|d| d.id}
- end
-=end
-
- end
-
- end
-
-end
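
The importer above rests on a single access pattern: GET a JSON document through RestClientWrapper (also removed in this commit) and create Mongoid documents with find_or_create_by. A trimmed sketch of that pattern, taken from the code above rather than a working replacement for it:

    # Fetch the eNanoMapper bundle list and register one Dataset per bundle (sketch).
    require "json"

    bundles = JSON.parse(
      RestClientWrapper.get("https://data.enanomapper.net/bundle", {}, {accept: :json})
    )["dataset"]

    bundles.each do |bundle|
      Dataset.find_or_create_by(:source => bundle["URI"], :name => bundle["title"].strip)
    end
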
diff --git a/lib/feature.rb b/lib/feature.rb
deleted file mode 100644
index 6f9d5c4..0000000
--- a/lib/feature.rb
+++ /dev/null
@@ -1,120 +0,0 @@
-module OpenTox
-
- # Original ID (e.g. from CSV input)
- class OriginalId < Feature
- field :dataset_id, type: BSON::ObjectId
- end
-
- # Original SMILES (e.g. from CSV input)
- class OriginalSmiles < Feature
- field :dataset_id, type: BSON::ObjectId
- end
-
- # Warnings
- class Warnings < Feature
- field :dataset_id, type: BSON::ObjectId
- end
-
- # Confidence
- class Confidence < Feature
- field :dataset_id, type: BSON::ObjectId
- def name
- "Confidence"
- end
- end
-
- # Categorical variables
- class NominalFeature < Feature
- field :accept_values, type: Array
- end
-
- # Quantitative variables
- class NumericFeature < Feature
- field :unit, type: String
- end
-
- # Nominal biological activity
- class NominalBioActivity < NominalFeature
- end
-
- # Numeric biological activity
- class NumericBioActivity < NumericFeature
- end
-
- # Merged nominal biological activity
- class MergedNominalBioActivity < NominalBioActivity
- field :original_feature_ids, type: Array
- field :transformations, type: Array
- end
-
- # Merged numeric biological activity
- class MergedNumericBioActivity < NumericBioActivity
- field :original_feature_ids, type: Array
- end
-
- # Transformed nominal biological activity
- class TransformedNominalBioActivity < NominalFeature
- field :original_feature_id, type: BSON::ObjectId
- field :transformation, type: Hash
- end
-
- # Transformed numeric biological activity
- class TransformedNumericBioActivity < NumericFeature
- field :original_feature_id, type: BSON::ObjectId
- field :transformation, type: String
- end
-
- # Nominal lazar prediction
- class NominalLazarPrediction < NominalFeature
- field :model_id, type: BSON::ObjectId
- field :training_feature_id, type: BSON::ObjectId
- def name
- "Prediction: #{self[:name]}"
- end
- end
-
- class LazarPredictionProbability < NominalLazarPrediction
- def name
- "Probability: #{self[:name]}"
- end
- end
-
- # Numeric lazar prediction
- class NumericLazarPrediction < NumericFeature
- field :model_id, type: BSON::ObjectId
- field :training_feature_id, type: BSON::ObjectId
- def name
- "Prediction: #{self[:name]}"
- end
- end
-
- class LazarPredictionInterval < NumericLazarPrediction
- def name
- "#{self[:name].capitalize} prediction interval"
- end
- end
-
- class NominalSubstanceProperty < NominalFeature
- end
-
- class NumericSubstanceProperty < NumericFeature
- field :category, type: String
- end
-
- class NanoParticleProperty < NumericSubstanceProperty
- field :conditions, type: Hash
- end
-
- # Feature for SMARTS fragments
- class Smarts < Feature
- field :smarts, type: String
- index "smarts" => 1
- # Create feature from SMARTS string
- # @param [String]
- # @return [OpenTox::Feature]
- def self.from_smarts smarts
- self.find_or_create_by :smarts => smarts
- end
- end
-
-end
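
The deleted feature hierarchy was used through Mongoid's find_or_create_by, which keeps feature creation idempotent. A hedged usage sketch; the SMARTS string and the activity name are arbitrary example values:

    # Create (or look up) a SMARTS fragment feature and a nominal activity feature.
    smarts_feature = OpenTox::Smarts.from_smarts "c1ccccc1"   # benzene ring fragment
    activity = OpenTox::NominalBioActivity.find_or_create_by(
      :name => "Mutagenicity",
      :accept_values => ["mutagenic", "non-mutagenic"]
    )
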
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 2cc1321..f8a2732 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -27,30 +27,10 @@ suppressPackageStartupMessages({
PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
=end
-[ # be aware of the require sequence as it affects class/method overwrites
- "array.rb",
-# "overwrite.rb",
-# "rest-client-wrapper.rb",
-# "opentox.rb",
-# "feature.rb",
-# "physchem.rb",
-# "substance.rb",
+[ "array.rb",
"compound.rb",
-# "nanoparticle.rb",
"dataset.rb",
-# "algorithm.rb",
"similarity.rb",
-# "feature_selection.rb",
"model.rb",
"statistics.rb",
-# "classification.rb",
-# "regression.rb",
-# "caret.rb",
-# "validation-statistics.rb",
-# "validation.rb",
-# "train-test-validation.rb",
-# "leave-one-out-validation.rb",
-# "crossvalidation.rb",
-# "download.rb",
-# "import.rb",
].each{ |f| require_relative f }
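
The comment kept in the require list above ("be aware of the require sequence as it affects class/method overwrites") refers to Ruby's open classes: whichever file is required last wins when two files define the same method. A small, generic illustration (Dataset#info is a made-up placeholder, not a lazar method):

    class Dataset
      def info; "first definition"; end
    end

    class Dataset                 # reopened, e.g. by a file required later
      def info; "second definition"; end
    end

    puts Dataset.new.info         # => second definition
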
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
deleted file mode 100644
index 7d73b89..0000000
--- a/lib/leave-one-out-validation.rb
+++ /dev/null
@@ -1,61 +0,0 @@
-module OpenTox
-
- module Validation
-
- # Leave one out validation
- class LeaveOneOut < Validation
-
- # Create a leave one out validation
- # @param [OpenTox::Model::Lazar]
- # @return [OpenTox::Validation::LeaveOneOut]
- def self.create model
- raise ArgumentError, "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
- $logger.debug "#{model.name}: LOO validation started"
- t = Time.now
- model.training_dataset.features.collect{|f| f.class}.include?(NominalBioActivity) ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut
- loo = klass.new :model_id => model.id
- predictions = model.predict model.training_dataset.substances
- predictions.each{|cid,p| p.delete(:neighbors)}
- predictions.each do |cid,prediction|
- prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) if prediction[:value]
- predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
- end
- predictions.select!{|cid,p| p[:value] and p[:measurements]}
- loo.predictions = predictions
- loo.statistics
- $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
- loo
- end
-
- end
-
- # Leave one out validation for classification models
- class ClassificationLeaveOneOut < LeaveOneOut
- include ClassificationStatistics
- field :accept_values, type: Array
- field :confusion_matrix, type: Hash
- field :weighted_confusion_matrix, type: Hash
- field :accuracy, type: Hash
- field :weighted_accuracy, type: Hash
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :nr_predictions, type: Hash
- field :probability_plot_id, type: BSON::ObjectId
- end
-
- # Leave one out validation for regression models
- class RegressionLeaveOneOut < LeaveOneOut
- include RegressionStatistics
- field :rmse, type: Hash
- field :mae, type: Hash
- field :r_squared, type: Hash
- field :within_prediction_interval, type: Hash
- field :out_of_prediction_interval, type: Hash
- field :nr_predictions, type: Hash
- field :warnings, type: Array
- field :correlation_plot_id, type: BSON::ObjectId
- end
-
- end
-
-end
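
The removed leave-one-out validation was invoked as sketched below; model is assumed to be a lazar model without supervised feature selection (otherwise create raises the ArgumentError shown above).

    loo = OpenTox::Validation::LeaveOneOut.create model
    loo.predictions.each do |compound_id, prediction|
      puts "#{compound_id}: #{prediction[:value]} (measured: #{prediction[:measurements]})"
    end
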
diff --git a/lib/opentox.rb b/lib/opentox.rb
deleted file mode 100644
index fb2a579..0000000
--- a/lib/opentox.rb
+++ /dev/null
@@ -1,18 +0,0 @@
-module OpenTox
-
- # create default OpenTox classes
- # provides Mongoid's query and persistence methods
- # http://mongoid.org/en/mongoid/docs/persistence.html
- # http://mongoid.org/en/mongoid/docs/querying.html
- CLASSES.each do |klass|
- c = Class.new do
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: klass.downcase.pluralize
- field :name, type: String
- end
- OpenTox.const_set klass,c
- end
-
-end
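
The deleted opentox.rb generated the core classes at load time with Class.new and const_set. The following self-contained sketch shows the same metaprogramming pattern without Mongoid, so it runs on plain Ruby; module and class names are illustrative only.

    module OpenToxSketch
      CLASSES = ["Compound", "Dataset"]
      CLASSES.each do |klass|
        c = Class.new do            # anonymous class ...
          attr_accessor :name
        end
        const_set klass, c          # ... registered under the given name
      end
    end

    d = OpenToxSketch::Dataset.new
    d.name = "training data"
    puts d.name
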
diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb
deleted file mode 100644
index db23e66..0000000
--- a/lib/rest-client-wrapper.rb
+++ /dev/null
@@ -1,97 +0,0 @@
-module OpenTox
-
- # Adjustments to the rest-client gem for OpenTox
- class RestClientWrapper
-
- attr_accessor :request, :response
-
- @@subjectid = nil
-
- def self.subjectid=(subjectid)
- @@subjectid = subjectid
- end
-
- def self.subjectid
- @@subjectid
- end
-
- # REST methods
- # Raises OpenTox::Error if call fails (rescued in overwrite.rb -> halt 502)
- # Does not wait for task to finish and returns task uri
- # @param [String] destination URI
- # @param [optional,Hash|String] Payload data posted to the service
- # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl
- # @return [RestClient::Response] REST call response
- [:head,:get,:post,:put,:delete].each do |method|
-
- define_singleton_method method do |uri,payload={},headers={},waiting_task=nil|
-
- uri = Addressable::URI.encode(uri)
- # check input
- raise ArgumentError, "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash)
- headers[:subjectid] ||= @@subjectid
- raise ArgumentError, "Invalid URI: '#{uri}'" unless URI.valid? uri
- # make sure that no header parameters are set in the payload
- [:accept,:content_type,:subjectid].each do |header|
- if defined? $aa || URI(uri).host == URI($aa[:uri]).host
- else
- raise ArgumentError, "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header]
- end
- end
-
- # create request
- args={}
- args[:method] = method
- args[:url] = uri
- args[:verify_ssl] = 0 if headers[:verify_ssl].nil? || headers[:verify_ssl].empty?
- args[:timeout] = 1800
- args[:payload] = payload
- headers.each{ |k,v| headers.delete(k) if v==nil } if headers #remove keys with empty values, as this can cause problems
- args[:headers] = headers
-
- $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post"
-
- @request = RestClient::Request.new(args)
- # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader)
- @response = @request.execute do |response, request, result|
- if [301, 302, 307].include? response.code and request.method == :get
- response.follow_redirection(request, result)
-=begin
- elsif response.code >= 400 and !URI.task?(uri)
- error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first
- begin # errors are returned as error reports in json, try to parse
- content = JSON.parse(response)
- msg = content["message"].to_s
- cause = content["errorCause"].to_s
- raise if msg.size==0 && cause.size==0 # parsing failed
- rescue # parsing error failed, use complete content as message
- msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}"
- cause = nil
- end
- Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method
-=end
- else
- response
- end
- end
- end
- end
-
-=begin
- #@return [Array] of hashes with error code, method and class
- def self.known_errors
- errors = []
- RestClient::STATUSES.each do |code,k|
- if code >= 400
- method = k.underscore.gsub(/ |'/,'_')
- method += "_error" unless method.match(/_error$/)
- klass = method.split("_").collect{|s| s.capitalize}.join("")
- errors << {:code => code, :method => method.to_sym, :class => klass}
- end
- end
- errors
- end
-=end
-
- end
-end
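
The verb methods of the removed wrapper were generated with define_singleton_method, so head/get/post/put/delete all share the (uri, payload, headers) signature. A usage sketch, reusing the eNanoMapper URI that appears in enm-import.rb above:

    require "json"

    response = OpenTox::RestClientWrapper.get(
      "https://data.enanomapper.net/bundle",   # destination URI
      {},                                      # empty payload
      {accept: :json}                          # headers (content negotiation)
    )
    bundles = JSON.parse(response)["dataset"]
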
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
deleted file mode 100644
index d034cd1..0000000
--- a/lib/train-test-validation.rb
+++ /dev/null
@@ -1,76 +0,0 @@
-module OpenTox
-
- module Validation
-
- # Training test set validation
- class TrainTest < Validation
-
- field :training_dataset_id, type: BSON::ObjectId
- field :test_dataset_id, type: BSON::ObjectId
-
- # Create a training test set validation
- # @param [OpenTox::Model::Lazar]
- # @param [OpenTox::Dataset] training dataset
- # @param [OpenTox::Dataset] test dataset
- # @return [OpenTox::Validation::TrainTest]
- def self.create model, training_set, test_set
-
- validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
- validation_model.save
- predictions = validation_model.predict test_set.substances
- predictions.each do |cid,prediction|
- prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) if prediction[:value]
- end
- predictions.select!{|cid,p| p[:value] and p[:measurements]}
- # remove neighbors to avoid mongos file size limit error on large datasets
- predictions.each{|cid,p| p.delete(:neighbors)} #if model.training_dataset.name.match(/mutagenicity/i)
- validation = self.new(
- :model_id => validation_model.id,
- :test_dataset_id => test_set.id,
- :predictions => predictions
- )
- validation.save
- validation
- end
-
- # Get test dataset
- # @return [OpenTox::Dataset]
- def test_dataset
- Dataset.find test_dataset_id
- end
-
- # Get training dataset
- # @return [OpenTox::Dataset]
- def training_dataset
- Dataset.find training_dataset_id
- end
-
- end
-
- # Training test set validation for classification models
- class ClassificationTrainTest < TrainTest
- include ClassificationStatistics
- field :accept_values, type: Array
- field :confusion_matrix, type: Array
- field :weighted_confusion_matrix, type: Array
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash
- field :predictivity, type: Hash
- field :probability_plot_id, type: BSON::ObjectId
- end
-
- # Training test set validation for regression models
- class RegressionTrainTest < TrainTest
- include RegressionStatistics
- field :rmse, type: Float, default:0
- field :mae, type: Float, default:0
- field :r_squared, type: Float
- field :within_prediction_interval, type: Integer, default:0
- field :out_of_prediction_interval, type: Integer, default:0
- field :correlation_plot_id, type: BSON::ObjectId
- end
-
- end
-
-end
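
The removed train/test validation was used roughly as follows; model, training_set and test_set are assumed to be a lazar model and two OpenTox::Dataset instances produced by an external split.

    validation = OpenTox::Validation::TrainTest.create model, training_set, test_set
    validation.predictions.each do |compound_id, p|
      puts "#{compound_id}: predicted #{p[:value]}, measured #{p[:measurements]}"
    end
    puts validation.test_dataset.name        # resolved via test_dataset_id
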
diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb
deleted file mode 100644
index fc10cd4..0000000
--- a/lib/unique_descriptors.rb
+++ /dev/null
@@ -1,120 +0,0 @@
-# set of non redundant descriptors, faster algorithms are preferred
-# TODO:
-# select logP algorithm
-# select l5 algorithm
-# use smarts matcher for atom counts
-# check correlations
-UNIQUEDESCRIPTORS = [
- "Openbabel.abonds", #Number of aromatic bonds
- "Openbabel.atoms", #Number of atoms
- "Openbabel.bonds", #Number of bonds
- "Openbabel.dbonds", #Number of double bonds
- "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
- "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
- "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
- #"Openbabe..L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!!
- "Openbabel.logP", #octanol/water partition coefficient
- "Openbabel.MP", #Melting point
- "Openbabel.MR", #molar refractivity
- "Openbabel.MW", #Molecular Weight filter
- "Openbabel.nF", #Number of Fluorine Atoms
- "Openbabel.sbonds", #Number of single bonds
- "Openbabel.tbonds", #Number of triple bonds
- "Openbabel.TPSA", #topological polar surface area
- "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and
- "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
- "Cdk.AcidicGroupCount", #Returns the number of acidic groups.
- #"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
- #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule.
- #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule.
- #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type.
- "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges
- "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight
- "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability
- "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al. .
- "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens).
- "Cdk.BasicGroupCount", #Returns the number of basic groups.
- #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order.
- "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information
- "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization
- "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6
- "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7
- "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6
- "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7
- "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information.
- "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework
- "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06}
- "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule.
- #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
- #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
- "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
- # TODO check why the next descriptor is not present in the CDK_DESCRIPTIONS variable.
- #"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
- "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
- "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
- "Cdk.LargestChain", #Returns the number of atoms in the largest chain
- "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain
- "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
- "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
- "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
- #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
- "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration.
- "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
- "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
- "Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule.
- #"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five.
- #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions .
- "Cdk.VABC", #Describes the volume of a molecule.
- "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule.
- "Cdk.WHIM", #Holistic descriptors described by Todeschini et al .
- #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight
- "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching.
- "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number.
- "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP.
- "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms.
- "Joelib.count.NumberOfS", #no description available
- "Joelib.count.NumberOfP", #no description available
- "Joelib.count.NumberOfO", #no description available
- "Joelib.count.NumberOfN", #no description available
- #"Joeli#.count.AromaticBonds", #no description available
- "Joelib.count.NumberOfI", #no description available
- "Joelib.count.NumberOfF", #no description available
- "Joelib.count.NumberOfC", #no description available
- "Joelib.count.NumberOfB", #no description available
- "Joelib.count.HydrophobicGroups", #no description available
- #"Joelib.KierShape3", #no description available
- #"Joelib.KierShape2", #no description available
- #"Joelib.KierShape1", #no description available
- #"Joelib.count.AcidicGroups", #no description available
- "Joelib.count.AliphaticOHGroups", #no description available
- #"Joelib.count.NumberOfAtoms", #no description available
- "Joelib.TopologicalRadius", #no description available
- "Joelib.GeometricalShapeCoefficient", #no description available
- #"Joelib.MolecularWeight", #no description available
- "Joelib.FractionRotatableBonds", #no description available
- #"Joeli..count.HBD2", #no description available
- #"Joelib.count.HBD1", #no description available
- "Joelib.LogP", #no description available
- "Joelib.GraphShapeCoefficient", #no description available
- "Joelib.count.BasicGroups", #no description available
- #"Joelib.count.RotatableBonds", #no description available
- "Joelib.count.HeavyBonds", #no description available
- "Joelib.PolarSurfaceArea", #no description available
- #"Joelib.ZagrebIndex1", #no description available
- "Joelib.GeometricalRadius", #no description available
- "Joelib.count.SO2Groups", #no description available
- "Joelib.count.AromaticOHGroups", #no description available
- "Joelib.GeometricalDiameter", #no description available
- #"Joelib.MolarRefractivity", #no description available
- "Joelib.count.NumberOfCl", #no description available
- "Joelib.count.OSOGroups", #no description available
- "Joelib.count.NumberOfBr", #no description available
- "Joelib.count.NO2Groups", #no description available
- "Joelib.count.HeteroCycles", #no description available
- #"Joelib.count.HBA2", #no description available
- #"Joelib.count.HBA1", #no description available
- #"Joelib.count.NumberOfBonds", #no description available
- "Joelib.count.SOGroups", #no description available
- "Joelib.TopologicalDiameter", #no description available
- "Joelib.count.NumberOfHal", #no description available
-]
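
Every entry in the removed UNIQUEDESCRIPTORS list encodes the calculation library and the descriptor name as "Library.descriptor". A small sketch of splitting the list by library; the three entries are samples taken from the list above:

    descriptors = ["Openbabel.logP", "Cdk.ALOGP", "Joelib.LogP"]   # stand-in for the full list
    by_library  = descriptors.group_by { |name| name.split(".", 2).first }
    # => {"Openbabel"=>["Openbabel.logP"], "Cdk"=>["Cdk.ALOGP"], "Joelib"=>["Joelib.LogP"]}
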
diff --git a/lib/validation.rb b/lib/validation.rb
deleted file mode 100644
index 9402361..0000000
--- a/lib/validation.rb
+++ /dev/null
@@ -1,26 +0,0 @@
-module OpenTox
-
- module Validation
-
- # Base validation class
- class Validation
- include OpenTox
- include Mongoid::Document
- include Mongoid::Timestamps
- store_in collection: "validations"
- field :name, type: String
- field :model_id, type: BSON::ObjectId
- field :predictions, type: Hash, default: {}
- field :finished_at, type: Time
-
- # Get model
- # @return [OpenTox::Model::Lazar]
- def model
- Model::Lazar.find model_id
- end
-
- end
-
- end
-
-end
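
The deleted base class stored every validation as a Mongoid document with its model_id and predictions hash, and resolved the model on demand. A hedged usage sketch; some_validation_id is a placeholder:

    validation = OpenTox::Validation::Validation.find some_validation_id
    puts validation.name
    puts validation.model.name           # Model::Lazar.find(model_id) under the hood
    puts validation.predictions.size
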