From a35be3d59a513701f8822af5b56510647d8d531c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 24 Aug 2019 15:01:04 +0200 Subject: obsolete files removed --- bin/classification_crossvalidation.rb | 4 + bin/classification_summary.rb | 4 + bin/confusion_matrix.rb | 4 + bin/fingerprint_independent_variables.rb | 4 + bin/scale_independent_variables.rb | 4 + lib/algorithm.rb | 13 ---- lib/crossvalidation.rb | 117 ----------------------------- lib/enm-import.rb | 125 ------------------------------- lib/feature.rb | 120 ----------------------------- lib/lazar.rb | 22 +----- lib/leave-one-out-validation.rb | 61 --------------- lib/opentox.rb | 18 ----- lib/rest-client-wrapper.rb | 97 ------------------------ lib/train-test-validation.rb | 76 ------------------- lib/unique_descriptors.rb | 120 ----------------------------- lib/validation.rb | 26 ------- 16 files changed, 21 insertions(+), 794 deletions(-) create mode 100755 bin/classification_crossvalidation.rb create mode 100755 bin/classification_summary.rb create mode 100755 bin/confusion_matrix.rb create mode 100755 bin/fingerprint_independent_variables.rb create mode 100755 bin/scale_independent_variables.rb delete mode 100644 lib/algorithm.rb delete mode 100644 lib/crossvalidation.rb delete mode 100644 lib/enm-import.rb delete mode 100644 lib/feature.rb delete mode 100644 lib/leave-one-out-validation.rb delete mode 100644 lib/opentox.rb delete mode 100644 lib/rest-client-wrapper.rb delete mode 100644 lib/train-test-validation.rb delete mode 100644 lib/unique_descriptors.rb delete mode 100644 lib/validation.rb diff --git a/bin/classification_crossvalidation.rb b/bin/classification_crossvalidation.rb new file mode 100755 index 0000000..4bd6bc6 --- /dev/null +++ b/bin/classification_crossvalidation.rb @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby +require_relative "../lib/lazar" +model = ClassificationModel.new ARGV[0] +model.crossvalidation diff --git a/bin/classification_summary.rb b/bin/classification_summary.rb new file mode 100755 index 0000000..a3e4172 --- /dev/null +++ b/bin/classification_summary.rb @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby +require_relative "../lib/lazar" +stat = ClassificationStatistics.new ARGV[0] +stat.summary diff --git a/bin/confusion_matrix.rb b/bin/confusion_matrix.rb new file mode 100755 index 0000000..789262d --- /dev/null +++ b/bin/confusion_matrix.rb @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby +require_relative "../lib/lazar" +stat = ClassificationStatistics.new ARGV[0] +stat.confusion_matrix diff --git a/bin/fingerprint_independent_variables.rb b/bin/fingerprint_independent_variables.rb new file mode 100755 index 0000000..7dea239 --- /dev/null +++ b/bin/fingerprint_independent_variables.rb @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby +require_relative "../lib/lazar" +dataset = Dataset.new ARGV[0] +dataset.fingerprint_independent_variables ARGV[0] diff --git a/bin/scale_independent_variables.rb b/bin/scale_independent_variables.rb new file mode 100755 index 0000000..1d7662a --- /dev/null +++ b/bin/scale_independent_variables.rb @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby +require_relative "../lib/lazar" +dataset = Dataset.new ARGV[0] +dataset.scale_independent_variables ARGV[0] diff --git a/lib/algorithm.rb b/lib/algorithm.rb deleted file mode 100644 index f70ac1a..0000000 --- a/lib/algorithm.rb +++ /dev/null @@ -1,13 +0,0 @@ -module OpenTox - - module Algorithm - - # Execute an algorithm with parameters - def self.run algorithm, parameters=nil - klass,method = algorithm.split('.') - Object.const_get(klass).send(method,parameters) - end - - end -end - diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb deleted file mode 100644 index e1761bc..0000000 --- a/lib/crossvalidation.rb +++ /dev/null @@ -1,117 +0,0 @@ -module OpenTox - - module Validation - - # Crossvalidation - class CrossValidation < Validation - field :validation_ids, type: Array, default: [] - field :folds, type: Integer, default: 10 - - # Create a crossvalidation - # @param [OpenTox::Model::Lazar] - # @param [Fixnum] number of folds - # @return [OpenTox::Validation::CrossValidation] - def self.create model, n=10 - $logger.debug model.algorithms - klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification - klass = RegressionCrossValidation if model.is_a? Model::LazarRegression - raise ArgumentError, "Unknown model class #{model.class}." unless klass - - cv = klass.new( - name: model.name, - model_id: model.id, - folds: n - ) - cv.save # set created_at - - training_dataset = model.training_dataset - training_dataset.folds(n).each_with_index do |fold,fold_nr| - #fork do # parallel execution of validations can lead to Rserve and memory problems - $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" - t = Time.now - validation = TrainTest.create(model, fold[0], fold[1]) - cv.validation_ids << validation.id - $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - end - cv.save - cv.statistics - cv.update_attributes(finished_at: Time.now) - cv - end - - # Get execution time - # @return [Fixnum] - def time - finished_at - created_at - end - - # Get individual validations - # @return [Array] - def validations - validation_ids.collect{|vid| TrainTest.find vid} - end - - # Get predictions for all compounds - # @return [Array] - def predictions - predictions = {} - validations.each{|v| predictions.merge!(v.predictions)} - predictions - end - end - - # Crossvalidation of classification models - class ClassificationCrossValidation < CrossValidation - include ClassificationStatistics - field :accept_values, type: Array - field :confusion_matrix, type: Hash - field :accuracy, type: Hash - field :true_rate, type: Hash - field :predictivity, type: Hash - field :nr_predictions, type: Hash - field :probability_plot_id, type: BSON::ObjectId - end - - # Crossvalidation of regression models - class RegressionCrossValidation < CrossValidation - include RegressionStatistics - field :rmse, type: Hash - field :mae, type: Hash - field :r_squared, type: Hash - field :within_prediction_interval, type: Hash - field :out_of_prediction_interval, type: Hash - field :nr_predictions, type: Hash - field :warnings, type: Array - field :correlation_plot_id, type: BSON::ObjectId - end - - # Independent repeated crossvalidations - class RepeatedCrossValidation < Validation - field :crossvalidation_ids, type: Array, default: [] - field :correlation_plot_id, type: BSON::ObjectId - - # Create repeated crossvalidations - # @param [OpenTox::Model::Lazar] - # @param [Fixnum] number of folds - # @param [Fixnum] number of repeats - # @return [OpenTox::Validation::RepeatedCrossValidation] - def self.create model, folds=10, repeats=5 - repeated_cross_validation = self.new - repeats.times do |n| - $logger.debug "Crossvalidation #{n+1} for #{model.name}" - repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id - end - repeated_cross_validation.save - repeated_cross_validation - end - - # Get crossvalidations - # @return [OpenTox::Validation::CrossValidation] - def crossvalidations - crossvalidation_ids.collect{|id| CrossValidation.find(id)} - end - - end - end - -end diff --git a/lib/enm-import.rb b/lib/enm-import.rb deleted file mode 100644 index cf1a26f..0000000 --- a/lib/enm-import.rb +++ /dev/null @@ -1,125 +0,0 @@ -module OpenTox - - # Import data from external databases - module Import - - class Enanomapper - include OpenTox - - # Import from eNanoMapper - def self.import - # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) - datasets = {} - bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle', {}, {accept: :json}))["dataset"] - bundles.each do |bundle| - datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip) - $logger.debug bundle["title"].strip - nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"], {}, {accept: :json}))["dataEntry"] - nanoparticles.each_with_index do |np,n| - core_id = nil - coating_ids = [] - np["composition"].each do |c| - uri = c["component"]["compound"]["URI"] - data = JSON.parse(RestClientWrapper.get("https://data.enanomapper.net/query/compound/url/all?search=#{uri}", {}, {accept: :json})) - source = data["dataEntry"][0]["compound"]["URI"] - smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] - names = [] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] - if smiles - compound = Compound.find_or_create_by(:smiles => smiles) - compound.name = names.first - compound.names = names.compact - else - compound = Compound.find_or_create_by(:name => names.first,:names => names.compact) - end - compound.source = source - compound.save - if c["relation"] == "HAS_CORE" - core_id = compound.id.to_s - elsif c["relation"] == "HAS_COATING" - coating_ids << compound.id.to_s - end - end if np["composition"] - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - :core_id => core_id, - :coating_ids => coating_ids - ) - #np["bundles"].keys.each do |bundle_uri| - #nanoparticle.dataset_ids << datasets[bundle_uri].id - #end - - studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"] - studies.each do |study| - dataset = datasets[np["bundles"].keys.first] - proteomics_features = {} - category = study["protocol"]["topcategory"] - source = study["protocol"]["category"]["term"] - study["effects"].each do |effect| - - effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - effect["conditions"].delete_if { |k, v| v.nil? } - - if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data - - JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) - nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset - end - else - name = effect["endpoint"] - unit = effect["result"]["unit"] - warnings = [] - case name - when "Log2 transformed" # use a sensible name - name = "log2(Net cell association)" - warnings = ["Original name was 'Log2 transformed'"] - unit = "log2(mL/ug(Mg))" - when "Total protein (BCA assay)" - category = "P-CHEM" - warnings = ["Category changed from TOX to P-CHEM"] - end - feature = klass.find_or_create_by( - :name => name, - :unit => unit, - :category => category, - :conditions => effect["conditions"], - :source => study["protocol"]["category"]["term"], - :measured => true, - :warnings => warnings - ) - nanoparticle.parse_ambit_value feature, effect["result"], dataset - end - end - end - nanoparticle.save - print "#{n}, " - end - puts - end - datasets.each { |u,d| d.save } - end - -=begin - def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries - #get list of bundle URIs - bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] - datasets = [] - bundles.each do |bundle| - uri = bundle["URI"] - study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`) - study["@graph"].each do |i| - puts i.to_yaml if i.keys.include? "sio:has-value" - end - end - datasets.collect{|d| d.id} - end -=end - - end - - end - -end diff --git a/lib/feature.rb b/lib/feature.rb deleted file mode 100644 index 6f9d5c4..0000000 --- a/lib/feature.rb +++ /dev/null @@ -1,120 +0,0 @@ -module OpenTox - - # Original ID (e.g. from CSV input) - class OriginalId < Feature - field :dataset_id, type: BSON::ObjectId - end - - # Original SMILES (e.g. from CSV input) - class OriginalSmiles < Feature - field :dataset_id, type: BSON::ObjectId - end - - # Warnings - class Warnings < Feature - field :dataset_id, type: BSON::ObjectId - end - - # Confidence - class Confidence < Feature - field :dataset_id, type: BSON::ObjectId - def name - "Confidence" - end - end - - # Categorical variables - class NominalFeature < Feature - field :accept_values, type: Array - end - - # Quantitative variables - class NumericFeature < Feature - field :unit, type: String - end - - # Nominal biological activity - class NominalBioActivity < NominalFeature - end - - # Numeric biological activity - class NumericBioActivity < NumericFeature - end - - # Merged nominal biological activity - class MergedNominalBioActivity < NominalBioActivity - field :original_feature_ids, type: Array - field :transformations, type: Array - end - - # Merged numeric biological activity - class MergedNumericBioActivity < NumericBioActivity - field :original_feature_ids, type: Array - end - - # Transformed nominal biological activity - class TransformedNominalBioActivity < NominalFeature - field :original_feature_id, type: BSON::ObjectId - field :transformation, type: Hash - end - - # Transformed numeric biological activity - class TransformedNumericBioActivity < NumericFeature - field :original_feature_id, type: BSON::ObjectId - field :transformation, type: String - end - - # Nominal lazar prediction - class NominalLazarPrediction < NominalFeature - field :model_id, type: BSON::ObjectId - field :training_feature_id, type: BSON::ObjectId - def name - "Prediction: #{self[:name]}" - end - end - - class LazarPredictionProbability < NominalLazarPrediction - def name - "Probability: #{self[:name]}" - end - end - - # Numeric lazar prediction - class NumericLazarPrediction < NumericFeature - field :model_id, type: BSON::ObjectId - field :training_feature_id, type: BSON::ObjectId - def name - "Prediction: #{self[:name]}" - end - end - - class LazarPredictionInterval < NumericLazarPrediction - def name - "#{self[:name].capitalize} prediction interval" - end - end - - class NominalSubstanceProperty < NominalFeature - end - - class NumericSubstanceProperty < NumericFeature - field :category, type: String - end - - class NanoParticleProperty < NumericSubstanceProperty - field :conditions, type: Hash - end - - # Feature for SMARTS fragments - class Smarts < Feature - field :smarts, type: String - index "smarts" => 1 - # Create feature from SMARTS string - # @param [String] - # @return [OpenTox::Feature] - def self.from_smarts smarts - self.find_or_create_by :smarts => smarts - end - end - -end diff --git a/lib/lazar.rb b/lib/lazar.rb index 2cc1321..f8a2732 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -27,30 +27,10 @@ suppressPackageStartupMessages({ PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" =end -[ # be aware of the require sequence as it affects class/method overwrites - "array.rb", -# "overwrite.rb", -# "rest-client-wrapper.rb", -# "opentox.rb", -# "feature.rb", -# "physchem.rb", -# "substance.rb", +[ "array.rb", "compound.rb", -# "nanoparticle.rb", "dataset.rb", -# "algorithm.rb", "similarity.rb", -# "feature_selection.rb", "model.rb", "statistics.rb", -# "classification.rb", -# "regression.rb", -# "caret.rb", -# "validation-statistics.rb", -# "validation.rb", -# "train-test-validation.rb", -# "leave-one-out-validation.rb", -# "crossvalidation.rb", -# "download.rb", -# "import.rb", ].each{ |f| require_relative f } diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb deleted file mode 100644 index 7d73b89..0000000 --- a/lib/leave-one-out-validation.rb +++ /dev/null @@ -1,61 +0,0 @@ -module OpenTox - - module Validation - - # Leave one out validation - class LeaveOneOut < Validation - - # Create a leave one out validation - # @param [OpenTox::Model::Lazar] - # @return [OpenTox::Validation::LeaveOneOut] - def self.create model - raise ArgumentError, "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection] - $logger.debug "#{model.name}: LOO validation started" - t = Time.now - model.training_dataset.features.collect{|f| f.class}.include?(NominalBioActivity) ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut - loo = klass.new :model_id => model.id - predictions = model.predict model.training_dataset.substances - predictions.each{|cid,p| p.delete(:neighbors)} - predictions.each do |cid,prediction| - prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) if prediction[:value] - predictions.delete(cid) unless prediction[:value] and prediction[:measurements] - end - predictions.select!{|cid,p| p[:value] and p[:measurements]} - loo.predictions = predictions - loo.statistics - $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" - loo - end - - end - - # Leave one out validation for classification models - class ClassificationLeaveOneOut < LeaveOneOut - include ClassificationStatistics - field :accept_values, type: Array - field :confusion_matrix, type: Hash - field :weighted_confusion_matrix, type: Hash - field :accuracy, type: Hash - field :weighted_accuracy, type: Hash - field :true_rate, type: Hash - field :predictivity, type: Hash - field :nr_predictions, type: Hash - field :probability_plot_id, type: BSON::ObjectId - end - - # Leave one out validation for regression models - class RegressionLeaveOneOut < LeaveOneOut - include RegressionStatistics - field :rmse, type: Hash - field :mae, type: Hash - field :r_squared, type: Hash - field :within_prediction_interval, type: Hash - field :out_of_prediction_interval, type: Hash - field :nr_predictions, type: Hash - field :warnings, type: Array - field :correlation_plot_id, type: BSON::ObjectId - end - - end - -end diff --git a/lib/opentox.rb b/lib/opentox.rb deleted file mode 100644 index fb2a579..0000000 --- a/lib/opentox.rb +++ /dev/null @@ -1,18 +0,0 @@ -module OpenTox - - # create default OpenTox classes - # provides Mongoid's query and persistence methods - # http://mongoid.org/en/mongoid/docs/persistence.html - # http://mongoid.org/en/mongoid/docs/querying.html - CLASSES.each do |klass| - c = Class.new do - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: klass.downcase.pluralize - field :name, type: String - end - OpenTox.const_set klass,c - end - -end diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb deleted file mode 100644 index db23e66..0000000 --- a/lib/rest-client-wrapper.rb +++ /dev/null @@ -1,97 +0,0 @@ -module OpenTox - - # Adjustments to the rest-client gem for OpenTox - class RestClientWrapper - - attr_accessor :request, :response - - @@subjectid = nil - - def self.subjectid=(subjectid) - @@subjectid = subjectid - end - - def self.subjectid - @@subjectid - end - - # REST methods - # Raises OpenTox::Error if call fails (rescued in overwrite.rb -> halt 502) - # Does not wait for task to finish and returns task uri - # @param [String] destination URI - # @param [optional,Hash|String] Payload data posted to the service - # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl - # @return [RestClient::Response] REST call response - [:head,:get,:post,:put,:delete].each do |method| - - define_singleton_method method do |uri,payload={},headers={},waiting_task=nil| - - uri = Addressable::URI.encode(uri) - # check input - raise ArgumentError, "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash) - headers[:subjectid] ||= @@subjectid - raise ArgumentError, "Invalid URI: '#{uri}'" unless URI.valid? uri - # make sure that no header parameters are set in the payload - [:accept,:content_type,:subjectid].each do |header| - if defined? $aa || URI(uri).host == URI($aa[:uri]).host - else - raise ArgumentError, "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header] - end - end - - # create request - args={} - args[:method] = method - args[:url] = uri - args[:verify_ssl] = 0 if headers[:verify_ssl].nil? || headers[:verify_ssl].empty? - args[:timeout] = 1800 - args[:payload] = payload - headers.each{ |k,v| headers.delete(k) if v==nil } if headers #remove keys with empty values, as this can cause problems - args[:headers] = headers - - $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post" - - @request = RestClient::Request.new(args) - # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader) - @response = @request.execute do |response, request, result| - if [301, 302, 307].include? response.code and request.method == :get - response.follow_redirection(request, result) -=begin - elsif response.code >= 400 and !URI.task?(uri) - error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first - begin # errors are returned as error reports in json, try to parse - content = JSON.parse(response) - msg = content["message"].to_s - cause = content["errorCause"].to_s - raise if msg.size==0 && cause.size==0 # parsing failed - rescue # parsing error failed, use complete content as message - msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}" - cause = nil - end - Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method -=end - else - response - end - end - end - end - -=begin - #@return [Array] of hashes with error code, method and class - def self.known_errors - errors = [] - RestClient::STATUSES.each do |code,k| - if code >= 400 - method = k.underscore.gsub(/ |'/,'_') - method += "_error" unless method.match(/_error$/) - klass = method.split("_").collect{|s| s.capitalize}.join("") - errors << {:code => code, :method => method.to_sym, :class => klass} - end - end - errors - end -=end - - end -end diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb deleted file mode 100644 index d034cd1..0000000 --- a/lib/train-test-validation.rb +++ /dev/null @@ -1,76 +0,0 @@ -module OpenTox - - module Validation - - # Training test set validation - class TrainTest < Validation - - field :training_dataset_id, type: BSON::ObjectId - field :test_dataset_id, type: BSON::ObjectId - - # Create a training test set validation - # @param [OpenTox::Model::Lazar] - # @param [OpenTox::Dataset] training dataset - # @param [OpenTox::Dataset] test dataset - # @return [OpenTox::Validation::TrainTest] - def self.create model, training_set, test_set - - validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms - validation_model.save - predictions = validation_model.predict test_set.substances - predictions.each do |cid,prediction| - prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) if prediction[:value] - end - predictions.select!{|cid,p| p[:value] and p[:measurements]} - # remove neighbors to avoid mongos file size limit error on large datasets - predictions.each{|cid,p| p.delete(:neighbors)} #if model.training_dataset.name.match(/mutagenicity/i) - validation = self.new( - :model_id => validation_model.id, - :test_dataset_id => test_set.id, - :predictions => predictions - ) - validation.save - validation - end - - # Get test dataset - # @return [OpenTox::Dataset] - def test_dataset - Dataset.find test_dataset_id - end - - # Get training dataset - # @return [OpenTox::Dataset] - def training_dataset - Dataset.find training_dataset_id - end - - end - - # Training test set validation for classification models - class ClassificationTrainTest < TrainTest - include ClassificationStatistics - field :accept_values, type: Array - field :confusion_matrix, type: Array - field :weighted_confusion_matrix, type: Array - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash - field :predictivity, type: Hash - field :probability_plot_id, type: BSON::ObjectId - end - - # Training test set validation for regression models - class RegressionTrainTest < TrainTest - include RegressionStatistics - field :rmse, type: Float, default:0 - field :mae, type: Float, default:0 - field :r_squared, type: Float - field :within_prediction_interval, type: Integer, default:0 - field :out_of_prediction_interval, type: Integer, default:0 - field :correlation_plot_id, type: BSON::ObjectId - end - - end - -end diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb deleted file mode 100644 index fc10cd4..0000000 --- a/lib/unique_descriptors.rb +++ /dev/null @@ -1,120 +0,0 @@ -# set of non redundant descriptors, faster algorithms are preferred -# TODO: -# select logP algorithm -# select l5 algorithm -# use smarts matcher for atom counts -# check correlations -UNIQUEDESCRIPTORS = [ - "Openbabel.abonds", #Number of aromatic bonds - "Openbabel.atoms", #Number of atoms - "Openbabel.bonds", #Number of bonds - "Openbabel.dbonds", #Number of double bonds - "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib) - "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib) - "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib) - #"Openbabe..L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! - "Openbabel.logP", #octanol/water partition coefficient - "Openbabel.MP", #Melting point - "Openbabel.MR", #molar refractivity - "Openbabel.MW", #Molecular Weight filter - "Openbabel.nF", #Number of Fluorine Atoms - "Openbabel.sbonds", #Number of single bonds - "Openbabel.tbonds", #Number of triple bonds - "Openbabel.TPSA", #topological polar surface area - "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and - "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens). - "Cdk.AcidicGroupCount", #Returns the number of acidic groups. - #"Cdk.AminoAcidCount", #Returns the number of amino acids found in the system - #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule. - #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule. - #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type. - "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges - "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight - "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability - "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al. . - "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens). - "Cdk.BasicGroupCount", #Returns the number of basic groups. - #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order. - "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information - "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization - "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6 - "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7 - "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6 - "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7 - "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information. - "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework - "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06} - "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule. - #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors. - #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors. - "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states. - # TODO check why the next descriptor is not present in the CDK_DESCRIPTIONS variable. - #"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential. - "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices. - "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments - "Cdk.LargestChain", #Returns the number of atoms in the largest chain - "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain - "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth. - "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain - "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O - #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms . - "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration. - "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule. - "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule. - "Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule. - #"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five. - #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions . - "Cdk.VABC", #Describes the volume of a molecule. - "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule. - "Cdk.WHIM", #Holistic descriptors described by Todeschini et al . - #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight - "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching. - "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number. - "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP. - "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms. - "Joelib.count.NumberOfS", #no description available - "Joelib.count.NumberOfP", #no description available - "Joelib.count.NumberOfO", #no description available - "Joelib.count.NumberOfN", #no description available - #"Joeli#.count.AromaticBonds", #no description available - "Joelib.count.NumberOfI", #no description available - "Joelib.count.NumberOfF", #no description available - "Joelib.count.NumberOfC", #no description available - "Joelib.count.NumberOfB", #no description available - "Joelib.count.HydrophobicGroups", #no description available - #"Joelib.KierShape3", #no description available - #"Joelib.KierShape2", #no description available - #"Joelib.KierShape1", #no description available - #"Joelib.count.AcidicGroups", #no description available - "Joelib.count.AliphaticOHGroups", #no description available - #"Joelib.count.NumberOfAtoms", #no description available - "Joelib.TopologicalRadius", #no description available - "Joelib.GeometricalShapeCoefficient", #no description available - #"Joelib.MolecularWeight", #no description available - "Joelib.FractionRotatableBonds", #no description available - #"Joeli..count.HBD2", #no description available - #"Joelib.count.HBD1", #no description available - "Joelib.LogP", #no description available - "Joelib.GraphShapeCoefficient", #no description available - "Joelib.count.BasicGroups", #no description available - #"Joelib.count.RotatableBonds", #no description available - "Joelib.count.HeavyBonds", #no description available - "Joelib.PolarSurfaceArea", #no description available - #"Joelib.ZagrebIndex1", #no description available - "Joelib.GeometricalRadius", #no description available - "Joelib.count.SO2Groups", #no description available - "Joelib.count.AromaticOHGroups", #no description available - "Joelib.GeometricalDiameter", #no description available - #"Joelib.MolarRefractivity", #no description available - "Joelib.count.NumberOfCl", #no description available - "Joelib.count.OSOGroups", #no description available - "Joelib.count.NumberOfBr", #no description available - "Joelib.count.NO2Groups", #no description available - "Joelib.count.HeteroCycles", #no description available - #"Joelib.count.HBA2", #no description available - #"Joelib.count.HBA1", #no description available - #"Joelib.count.NumberOfBonds", #no description available - "Joelib.count.SOGroups", #no description available - "Joelib.TopologicalDiameter", #no description available - "Joelib.count.NumberOfHal", #no description available -] diff --git a/lib/validation.rb b/lib/validation.rb deleted file mode 100644 index 9402361..0000000 --- a/lib/validation.rb +++ /dev/null @@ -1,26 +0,0 @@ -module OpenTox - - module Validation - - # Base validation class - class Validation - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "validations" - field :name, type: String - field :model_id, type: BSON::ObjectId - field :predictions, type: Hash, default: {} - field :finished_at, type: Time - - # Get model - # @return [OpenTox::Model::Lazar] - def model - Model::Lazar.find model_id - end - - end - - end - -end -- cgit v1.2.3