summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgebele <gebele@in-silico.ch>2017-01-18 14:17:02 +0000
committergebele <gebele@in-silico.ch>2017-01-18 14:17:02 +0000
commitdd8b986e198552ce865aa5500335043f60a0eef3 (patch)
treef5c06159d0a529f263ea25f6354f89825fc41410
parent76912e72cf52c2a10423226ababb2b6d7f4b84ca (diff)
parentd7504cc422bbaeee3546589d87e7baeb4e977c0b (diff)
fixed merge
-rw-r--r--README.md65
-rw-r--r--lib/algorithm.rb1
-rw-r--r--lib/caret.rb13
-rw-r--r--lib/classification.rb7
-rw-r--r--lib/compound.rb55
-rw-r--r--lib/crossvalidation.rb22
-rw-r--r--lib/dataset.rb33
-rw-r--r--lib/experiment.rb99
-rw-r--r--lib/feature.rb7
-rw-r--r--lib/feature_selection.rb5
-rw-r--r--lib/import.rb6
-rw-r--r--lib/leave-one-out-validation.rb6
-rw-r--r--lib/model.rb80
-rw-r--r--lib/nanoparticle.rb19
-rw-r--r--lib/overwrite.rb57
-rw-r--r--lib/physchem.rb28
-rw-r--r--lib/regression.rb5
-rw-r--r--lib/rest-client-wrapper.rb1
-rw-r--r--lib/similarity.rb24
-rw-r--r--lib/substance.rb1
-rw-r--r--lib/train-test-validation.rb12
-rw-r--r--lib/validation-statistics.rb19
-rw-r--r--lib/validation.rb3
-rw-r--r--test/feature.rb4
-rw-r--r--test/model-nanoparticle.rb7
-rw-r--r--test/nanomaterial-model-validation.rb5
-rw-r--r--test/setup.rb6
-rw-r--r--test/validation-regression.rb5
28 files changed, 430 insertions, 165 deletions
diff --git a/README.md b/README.md
index 658e440..28ed18f 100644
--- a/README.md
+++ b/README.md
@@ -26,10 +26,73 @@ Installation
The output should give you more verbose information that can help in debugging (e.g. to identify missing libraries).
+Tutorial
+--------
+
+Execute the following commands either from an interactive Ruby shell or a Ruby script:
+
+### Create and use `lazar` models for small molecules
+
+#### Create a training dataset
+
+ Create a CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at [Github](https://github.com/opentox/lazar-public-data).
+
+#### Create and validate a `lazar` model with default algorithms and parameters
+
+ `validated_model = Model::Validation.from_csv_file "EPAFHM_log10.csv"`
+
+ This command will create a `lazar` model and validate it with three independent 10-fold crossvalidations.
+
+#### Inspect crossvalidation results
+
+ `validated_model.crossvalidations`
+
+#### Predict a new compound
+
+ Create a compound
+
+ `compound = Compound.from_smiles "NC(=O)OCCC"`
+
+ Predict Fathead Minnow Acute Toxicity
+
+ `validated_model.predict compound`
+
+#### Experiment with other algorithms
+
+ You can pass algorithms parameters to the `Model::Validation.from_csv_file` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions.
+
+### Create and use `lazar` nanoparticle models
+
+#### Create and validate a `nano-lazar` model from eNanoMapper with default algorithms and parameters
+
+ `validated_model = Model::Validation.from_enanomapper`
+
+ This command will mirror the eNanoMapper database in the local database, create a `nano-lazar` model and validate it with five independent 10-fold crossvalidations.
+
+#### Inspect crossvalidation results
+
+ `validated_model.crossvalidations`
+
+#### Predict nanoparticle toxicities
+
+ Choose a random nanoparticle from the "Protein Corona" dataset
+ ```
+ training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+ nanoparticle = training_dataset.substances.shuffle.first
+ ```
+
+ Predict the "Net Cell Association" endpoint
+
+ `validated_model.predict nanoparticle`
+
+#### Experiment with other datasets, endpoints and algorithms
+
+ You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.from_enanomapper` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions. Detailed documentation and validation results can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf).
+
Documentation
-------------
* [API documentation](http://rdoc.info/gems/lazar)
Copyright
---------
-Copyright (c) 2009-2016 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
+Copyright (c) 2009-2017 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 0e4b93a..f70ac1a 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -2,6 +2,7 @@ module OpenTox
module Algorithm
+ # Execute an algorithm with parameters
def self.run algorithm, parameters=nil
klass,method = algorithm.split('.')
Object.const_get(klass).send(method,parameters)
diff --git a/lib/caret.rb b/lib/caret.rb
index 7e4f771..f5c2bde 100644
--- a/lib/caret.rb
+++ b/lib/caret.rb
@@ -1,9 +1,17 @@
module OpenTox
module Algorithm
+ # Ruby interface for the R caret package
+ # Caret model list: https://topepo.github.io/caret/modelList.html
class Caret
- # model list: https://topepo.github.io/caret/modelList.html
+ # Create a local R caret model and make a prediction
+ # @param [Array<Float,Bool>] dependent_variables
+ # @param [Array<Array<Float,Bool>>] independent_variables
+ # @param [Array<Float>] weights
+ # @param [String] Caret method
+ # @param [Array<Float,Bool>] query_variables
+ # @return [Hash]
def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables:
remove = []
# remove independent_variables with single values
@@ -77,12 +85,13 @@ module OpenTox
end
- # call caret methods dynamically, e.g. Caret.pls
+ # Call caret methods dynamically, e.g. Caret.pls
def self.method_missing(sym, *args, &block)
args.first[:method] = sym.to_s
self.create_model_and_predict args.first
end
+ # Convert Ruby values to R values
def self.to_r v
return "F" if v == false
return "T" if v == true
diff --git a/lib/classification.rb b/lib/classification.rb
index e8c179f..638492b 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -1,9 +1,14 @@
module OpenTox
module Algorithm
+ # Classification algorithms
class Classification
- def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:
+ # Weighted majority vote
+ # @param [Array<TrueClass,FalseClass>] dependent_variables
+ # @param [Array<Float>] weights
+ # @return [Hash]
+ def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:nil
class_weights = {}
dependent_variables.each_with_index do |v,i|
class_weights[v] ||= []
diff --git a/lib/compound.rb b/lib/compound.rb
index 8a1143b..bfe69e3 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -2,6 +2,7 @@ CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/"
module OpenTox
+ # Small molecules with defined chemical structures
class Compound < Substance
require_relative "unique_descriptors.rb"
DEFAULT_FINGERPRINT = "MP2D"
@@ -28,6 +29,9 @@ module OpenTox
compound
end
+ # Create chemical fingerprint
+ # @param [String] fingerprint type
+ # @return [Array<String>]
def fingerprint type=DEFAULT_FINGERPRINT
unless fingerprints[type]
return [] unless self.smiles
@@ -75,6 +79,9 @@ module OpenTox
fingerprints[type]
end
+ # Calculate physchem properties
+ # @param [Array<Hash>] list of descriptors
+ # @return [Array<Float>]
def calculate_properties descriptors=PhysChem::OPENBABEL
calculated_ids = properties.keys
# BSON::ObjectId instances are not allowed as keys in a BSON document.
@@ -96,6 +103,10 @@ module OpenTox
descriptors.collect{|d| properties[d.id.to_s]}
end
+ # Match a SMARTS substructure
+ # @param [String] smarts
+ # @param [TrueClass,FalseClass] count matches or return true/false
+ # @return [TrueClass,FalseClass,Fixnum]
def smarts_match smarts, count=false
obconversion = OpenBabel::OBConversion.new
obmol = OpenBabel::OBMol.new
@@ -116,8 +127,8 @@ module OpenTox
# Create a compound from smiles string
# @example
# compound = OpenTox::Compound.from_smiles("c1ccccc1")
- # @param [String] smiles Smiles string
- # @return [OpenTox::Compound] Compound
+ # @param [String] smiles
+ # @return [OpenTox::Compound]
def self.from_smiles smiles
if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
$logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
@@ -132,9 +143,9 @@ module OpenTox
end
end
- # Create a compound from inchi string
- # @param inchi [String] smiles InChI string
- # @return [OpenTox::Compound] Compound
+ # Create a compound from InChI string
+ # @param [String] InChI
+ # @return [OpenTox::Compound]
def self.from_inchi inchi
#smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
smiles = obconversion(inchi,"inchi","can")
@@ -145,9 +156,9 @@ module OpenTox
end
end
- # Create a compound from sdf string
- # @param sdf [String] smiles SDF string
- # @return [OpenTox::Compound] Compound
+ # Create a compound from SDF
+ # @param [String] SDF
+ # @return [OpenTox::Compound]
def self.from_sdf sdf
# do not store sdf because it might be 2D
Compound.from_smiles obconversion(sdf,"sdf","can")
@@ -156,40 +167,38 @@ module OpenTox
# Create a compound from name. Relies on an external service for name lookups.
# @example
# compound = OpenTox::Compound.from_name("Benzene")
- # @param name [String] can be also an InChI/InChiKey, CAS number, etc
- # @return [OpenTox::Compound] Compound
+ # @param [String] name, can be also an InChI/InChiKey, CAS number, etc
+ # @return [OpenTox::Compound]
def self.from_name name
Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles"))
end
# Get InChI
- # @return [String] InChI string
+ # @return [String]
def inchi
unless self["inchi"]
-
result = obconversion(smiles,"smi","inchi")
- #result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp
update(:inchi => result.chomp) if result and !result.empty?
end
self["inchi"]
end
# Get InChIKey
- # @return [String] InChIKey string
+ # @return [String]
def inchikey
update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"]
self["inchikey"]
end
# Get (canonical) smiles
- # @return [String] Smiles string
+ # @return [String]
def smiles
update(:smiles => obconversion(self["smiles"],"smi","can")) unless self["smiles"]
self["smiles"]
end
- # Get sdf
- # @return [String] SDF string
+ # Get SDF
+ # @return [String]
def sdf
if self.sdf_id.nil?
sdf = obconversion(smiles,"smi","sdf")
@@ -209,7 +218,6 @@ module OpenTox
update(:svg_id => $gridfs.insert_one(file))
end
$gridfs.find_one(_id: self.svg_id).data
-
end
# Get png image
@@ -223,26 +231,27 @@ module OpenTox
update(:png_id => $gridfs.insert_one(file))
end
Base64.decode64($gridfs.find_one(_id: self.png_id).data)
-
end
# Get all known compound names. Relies on an external service for name lookups.
# @example
# names = compound.names
- # @return [String] Compound names
+ # @return [Array<String>]
def names
update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"]
self["names"]
end
- # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem
+ # Get PubChem Compound Identifier (CID), obtained via REST call to PubChem
+ # @return [String]
def cid
pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
self["cid"]
end
- # @return [String] ChEMBL database compound id, derieved via restcall to chembl
+ # Get ChEMBL database compound id, obtained via REST call to ChEMBL
+ # @return [String]
def chemblid
# https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
@@ -292,7 +301,7 @@ module OpenTox
mg.to_f/molecular_weight
end
- # Calculate molecular weight of Compound with OB and store it in object
+ # Calculate molecular weight of Compound with OB and store it in compound object
# @return [Float] molecular weight
def molecular_weight
mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index bcb3ccf..75c5db5 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -1,10 +1,16 @@
module OpenTox
module Validation
+
+ # Crossvalidation
class CrossValidation < Validation
field :validation_ids, type: Array, default: []
field :folds, type: Integer, default: 10
+ # Create a crossvalidation
+ # @param [OpenTox::Model::Lazar]
+ # @param [Fixnum] number of folds
+ # @return [OpenTox::Validation::CrossValidation]
def self.create model, n=10
$logger.debug model.algorithms
klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
@@ -41,14 +47,20 @@ module OpenTox
cv
end
+ # Get execution time
+ # @return [Fixnum]
def time
finished_at - created_at
end
+ # Get individual validations
+ # @return [Array<OpenTox::Validation>]
def validations
validation_ids.collect{|vid| TrainTest.find vid}
end
+ # Get predictions for all compounds
+ # @return [Array<Hash>]
def predictions
predictions = {}
validations.each{|v| predictions.merge!(v.predictions)}
@@ -56,6 +68,7 @@ module OpenTox
end
end
+ # Crossvalidation of classification models
class ClassificationCrossValidation < CrossValidation
include ClassificationStatistics
field :accept_values, type: Array
@@ -68,6 +81,7 @@ module OpenTox
field :probability_plot_id, type: BSON::ObjectId
end
+ # Crossvalidation of regression models
class RegressionCrossValidation < CrossValidation
include RegressionStatistics
field :rmse, type: Float, default:0
@@ -78,10 +92,16 @@ module OpenTox
field :correlation_plot_id, type: BSON::ObjectId
end
+ # Independent repeated crossvalidations
class RepeatedCrossValidation < Validation
field :crossvalidation_ids, type: Array, default: []
field :correlation_plot_id, type: BSON::ObjectId
+ # Create repeated crossvalidations
+ # @param [OpenTox::Model::Lazar]
+ # @param [Fixnum] number of folds
+ # @param [Fixnum] number of repeats
+ # @return [OpenTox::Validation::RepeatedCrossValidation]
def self.create model, folds=10, repeats=3
repeated_cross_validation = self.new
repeats.times do |n|
@@ -92,6 +112,8 @@ module OpenTox
repeated_cross_validation
end
+ # Get crossvalidations
+ # @return [Array<OpenTox::Validation::CrossValidation>]
def crossvalidations
crossvalidation_ids.collect{|id| CrossValidation.find(id)}
end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index ab55294..44690e1 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -3,32 +3,43 @@ require 'tempfile'
module OpenTox
+ # Collection of substances and features
class Dataset
field :data_entries, type: Hash, default: {}
# Readers
+ # Get all compounds
+ # @return [Array<OpenTox::Compound>]
def compounds
substances.select{|s| s.is_a? Compound}
end
+ # Get all nanoparticles
+ # @return [Array<OpenTox::Nanoparticle>]
def nanoparticles
substances.select{|s| s.is_a? Nanoparticle}
end
# Get all substances
+ # @return [Array<OpenTox::Substance>]
def substances
@substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq
@substances
end
# Get all features
+ # @return [Array<OpenTox::Feature>]
def features
@features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq
@features
end
+ # Get all values for a given substance and feature
+ # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id
+ # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id
+ # @return [TrueClass,FalseClass,Float]
def values substance,feature
substance = substance.id if substance.is_a? Substance
feature = feature.id if feature.is_a? Feature
@@ -41,6 +52,10 @@ module OpenTox
# Writers
+ # Add a value for a given substance and feature
+ # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id
+ # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id
+ # @param [TrueClass,FalseClass,Float]
def add(substance,feature,value)
substance = substance.id if substance.is_a? Substance
feature = feature.id if feature.is_a? Feature
@@ -87,7 +102,7 @@ module OpenTox
# Serialisation
- # converts dataset to csv format including compound smiles as first column, other column headers are feature names
+ # Convert dataset to csv format including compound smiles as first column, other column headers are feature names
# @return [String]
def to_csv(inchi=false)
CSV.generate() do |csv|
@@ -130,6 +145,9 @@ module OpenTox
#end
# Create a dataset from CSV file
+ # @param [File]
+ # @param [TrueClass,FalseClass] accept or reject empty values
+ # @return [OpenTox::Dataset]
def self.from_csv_file file, accept_empty_values=false
source = file
name = File.basename(file,".*")
@@ -145,8 +163,10 @@ module OpenTox
dataset
end
- # parse data in tabular format (e.g. from csv)
- # does a lot of guesswork in order to determine feature types
+ # Parse data in tabular format (e.g. from csv)
+ # does a lot of guesswork in order to determine feature types
+ # @param [Array<Array>]
+ # @param [TrueClass,FalseClass] accept or reject empty values
def parse_table table, accept_empty_values
# features
@@ -225,6 +245,7 @@ module OpenTox
save
end
+ # Delete dataset
def delete
compounds.each{|c| c.dataset_ids.delete id.to_s}
super
@@ -238,14 +259,20 @@ module OpenTox
field :prediction_feature_id, type: BSON::ObjectId
field :predictions, type: Hash, default: {}
+ # Get prediction feature
+ # @return [OpenTox::Feature]
def prediction_feature
Feature.find prediction_feature_id
end
+ # Get all compounds
+ # @return [Array<OpenTox::Compound>]
def compounds
substances.select{|s| s.is_a? Compound}
end
+ # Get all substances
+ # @return [Array<OpenTox::Substance>]
def substances
predictions.keys.collect{|id| Substance.find id}
end
diff --git a/lib/experiment.rb b/lib/experiment.rb
deleted file mode 100644
index 0dfdf86..0000000
--- a/lib/experiment.rb
+++ /dev/null
@@ -1,99 +0,0 @@
-module OpenTox
-
- class Experiment
- field :dataset_ids, type: Array
- field :model_settings, type: Array, default: []
- field :results, type: Hash, default: {}
-
- def run
- dataset_ids.each do |dataset_id|
- dataset = Dataset.find(dataset_id)
- results[dataset_id.to_s] = []
- model_settings.each do |setting|
- setting = setting.dup
- model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
- model = Object.const_get(model_algorithm).create dataset, setting
- $logger.debug model
- model.save
- repeated_crossvalidation = RepeatedCrossValidation.create model
- results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
- end
- end
- save
- end
-
- def report
- # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
- report = {}
- report[:name] = name
- report[:experiment_id] = self.id.to_s
- report[:results] = {}
- parameters = []
- dataset_ids.each do |dataset_id|
- dataset_name = Dataset.find(dataset_id).name
- report[:results][dataset_name] = {}
- report[:results][dataset_name][:anova] = {}
- report[:results][dataset_name][:data] = []
- # TODO results[dataset_id.to_s] does not exist
- results[dataset_id.to_s].each do |result|
- model = Model::Lazar.find(result[:model_id])
- repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
- crossvalidations = repeated_cv.crossvalidations
- if crossvalidations.first.is_a? ClassificationCrossValidation
- parameters = [:accuracy,:true_rate,:predictivity]
- elsif crossvalidations.first.is_a? RegressionCrossValidation
- parameters = [:rmse,:mae,:r_squared]
- end
- summary = {}
- [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
- summary[key] = model[key]
- end
- summary[:nr_instances] = crossvalidations.first.nr_instances
- summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
- summary[:time] = crossvalidations.collect{|cv| cv.time}
- parameters.each do |param|
- summary[param] = crossvalidations.collect{|cv| cv.send(param)}
- end
- report[:results][dataset_name][:data] << summary
- end
- end
- report[:results].each do |dataset,results|
- ([:time,:nr_unpredicted]+parameters).each do |param|
- experiments = []
- outcome = []
- results[:data].each_with_index do |result,i|
- result[param].each do |p|
- experiments << i
- p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
- outcome << p
- end
- end
- begin
- R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
- R.eval "experiment_nr = factor(experiment_nr)"
- R.assign "outcome", outcome
- R.eval "data = data.frame(experiment_nr,outcome)"
- # one-way ANOVA
- R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
- # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
- p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
- # aequivalent
- # sum = R.eval("summary(fit)")
- #p_value = sum.to_ruby.first.last.first
- rescue
- p_value = nil
- end
- report[:results][dataset][:anova][param] = p_value
-=begin
-=end
- end
- end
- report
- end
-
- def summary
- report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
- end
- end
-
-end
diff --git a/lib/feature.rb b/lib/feature.rb
index 0ca4d41..f811aef 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -8,10 +8,14 @@ module OpenTox
field :unit, type: String
field :conditions, type: Hash
+ # Is it a nominal feature
+ # @return [TrueClass,FalseClass]
def nominal?
self.class == NominalFeature
end
+ # Is it a numeric feature
+ # @return [TrueClass,FalseClass]
def numeric?
self.class == NumericFeature
end
@@ -30,6 +34,9 @@ module OpenTox
class Smarts < NominalFeature
field :smarts, type: String
index "smarts" => 1
+ # Create feature from SMARTS string
+ # @param [String]
+ # @return [OpenTox::Feature]
def self.from_smarts smarts
self.find_or_create_by :smarts => smarts
end
diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb
index 65f9752..c596b1f 100644
--- a/lib/feature_selection.rb
+++ b/lib/feature_selection.rb
@@ -1,13 +1,16 @@
module OpenTox
module Algorithm
+ # Feature selection algorithms
class FeatureSelection
+ # Select features correlated to the models prediction feature
+ # @param [OpenTox::Model::Lazar]
def self.correlation_filter model
relevant_features = {}
R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
model.descriptor_weights = []
- selected_variables = []
+ selected_variables = []
selected_descriptor_ids = []
model.independent_variables.each_with_index do |v,i|
v.collect!{|n| to_r(n)}
diff --git a/lib/import.rb b/lib/import.rb
index 7a68335..96e7ad1 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -1,12 +1,14 @@
module OpenTox
+ # Import data from external databases
module Import
class Enanomapper
include OpenTox
- # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
+ # Import from eNanoMapper
def self.import
+ # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
datasets = {}
bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
bundles.each do |bundle|
@@ -20,6 +22,7 @@ module OpenTox
uri = c["component"]["compound"]["URI"]
uri = CGI.escape File.join(uri,"&media=application/json")
data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
+ source = data["dataEntry"][0]["compound"]["URI"]
smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
names = []
names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
@@ -31,6 +34,7 @@ module OpenTox
else
compound = Compound.find_or_create_by(:name => names.first,:names => names.compact)
end
+ compound.source = source
compound.save
if c["relation"] == "HAS_CORE"
core_id = compound.id.to_s
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 538b7b3..8d22018 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -2,8 +2,12 @@ module OpenTox
module Validation
+ # Leave one out validation
class LeaveOneOut < Validation
+ # Create a leave one out validation
+ # @param [OpenTox::Model::Lazar]
+ # @return [OpenTox::Validation::LeaveOneOut]
def self.create model
bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
$logger.debug "#{model.name}: LOO validation started"
@@ -32,6 +36,7 @@ module OpenTox
end
+ # Leave one out validation for classification models
class ClassificationLeaveOneOut < LeaveOneOut
include ClassificationStatistics
field :accept_values, type: Array
@@ -44,6 +49,7 @@ module OpenTox
field :confidence_plot_id, type: BSON::ObjectId
end
+ # Leave one out validation for regression models
class RegressionLeaveOneOut < LeaveOneOut
include RegressionStatistics
field :rmse, type: Float, default: 0
diff --git a/lib/model.rb b/lib/model.rb
index 9c4a93f..b18610d 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -9,6 +9,8 @@ module OpenTox
include Mongoid::Timestamps
store_in collection: "models"
+ attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems
+
field :name, type: String
field :creator, type: String, default: __FILE__
field :algorithms, type: Hash, default:{}
@@ -17,7 +19,7 @@ module OpenTox
field :prediction_feature_id, type: BSON::ObjectId
field :dependent_variables, type: Array, default:[]
field :descriptor_ids, type:Array, default:[]
- field :independent_variables, type: Array, default:[]
+ field :independent_variables_id, type: BSON::ObjectId
field :fingerprints, type: Array, default:[]
field :descriptor_weights, type: Array, default:[]
field :descriptor_means, type: Array, default:[]
@@ -25,7 +27,15 @@ module OpenTox
field :scaled_variables, type: Array, default:[]
field :version, type: Hash, default:{}
- def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
+ # Create a lazar model
+ # @param [OpenTox::Dataset] training_dataset
+ # @param [OpenTox::Feature, nil] prediction_feature
+ # By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
+ # @param [Hash, nil] algorithms
+ # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
+ #
+ # @return [OpenTox::Model::Lazar]
+ def self.create prediction_feature:nil, training_dataset:, algorithms:{}
bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
prediction_feature = training_dataset.features.first unless prediction_feature
# TODO: prediction_feature without training_dataset: use all available data
@@ -119,6 +129,7 @@ module OpenTox
end
descriptor_method = model.algorithms[:descriptors][:method]
+ model.independent_variables = []
case descriptor_method
# parse fingerprints
when "fingerprint"
@@ -177,8 +188,12 @@ module OpenTox
model
end
+ # Predict a substance (compound or nanoparticle)
+ # @param [OpenTox::Substance]
+ # @return [Hash]
def predict_substance substance
+ @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
case algorithms[:similarity][:method]
when /tanimoto/ # binary features
similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
@@ -234,7 +249,7 @@ module OpenTox
neighbor_dependent_variables << dependent_variables[i]
independent_variables.each_with_index do |c,j|
neighbor_independent_variables[j] ||= []
- neighbor_independent_variables[j] << independent_variables[j][i]
+ neighbor_independent_variables[j] << @independent_variables[j][i]
end
end
end
@@ -256,6 +271,9 @@ module OpenTox
prediction
end
+ # Predict a substance (compound or nanoparticle), an array of substances or a dataset
+ # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
+ # @return [Hash, Array<Hash>, OpenTox::Dataset]
def predict object
training_dataset = Dataset.find training_dataset_id
@@ -302,34 +320,62 @@ module OpenTox
end
+ # Save the model
+ # Stores independent_variables in GridFS to avoid Mongo database size limit problems
+ def save
+ file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
+ self.independent_variables_id = $gridfs.insert_one(file)
+ super
+ end
+
+ # Get independent variables
+ # @return [Array<Array>]
+ def independent_variables
+ @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
+ @independent_variables
+ end
+
+ # Get training dataset
+ # @return [OpenTox::Dataset]
def training_dataset
Dataset.find(training_dataset_id)
end
+ # Get prediction feature
+ # @return [OpenTox::Feature]
def prediction_feature
Feature.find(prediction_feature_id)
end
+ # Get training descriptors
+ # @return [Array<OpenTox::Feature>]
def descriptors
descriptor_ids.collect{|id| Feature.find(id)}
end
+ # Get training substances
+ # @return [Array<OpenTox::Substance>]
def substances
substance_ids.collect{|id| Substance.find(id)}
end
+ # Are fingerprints used as descriptors
+ # @return [TrueClass, FalseClass]
def fingerprints?
algorithms[:descriptors][:method] == "fingerprint" ? true : false
end
end
+ # Classification model
class LazarClassification < Lazar
end
+ # Regression model
class LazarRegression < Lazar
end
+ # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets
class Validation
include OpenTox
@@ -343,42 +389,64 @@ module OpenTox
field :model_id, type: BSON::ObjectId
field :repeated_crossvalidation_id, type: BSON::ObjectId
+ # Predict a substance (compound or nanoparticle), an array of substances or a dataset
+ # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset]
+ # @return [Hash, Array<Hash>, OpenTox::Dataset]
def predict object
model.predict object
end
+ # Get training dataset
+ # @return [OpenTox::Dataset]
def training_dataset
model.training_dataset
end
+ # Get lazar model
+ # @return [OpenTox::Model::Lazar]
def model
Lazar.find model_id
end
+ # Get algorithms
+ # @return [Hash]
def algorithms
model.algorithms
end
+ # Get prediction feature
+ # @return [OpenTox::Feature]
def prediction_feature
model.prediction_feature
end
+ # Get repeated crossvalidations
+ # @return [OpenTox::Validation::RepeatedCrossValidation]
def repeated_crossvalidation
OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
end
+ # Get crossvalidations
+ # @return [Array<OpenTox::Validation::CrossValidation>]
def crossvalidations
repeated_crossvalidation.crossvalidations
end
+ # Is it a regression model
+ # @return [TrueClass, FalseClass]
def regression?
model.is_a? LazarRegression
end
+ # Is it a classification model
+ # @return [TrueClass, FalseClass]
def classification?
model.is_a? LazarClassification
end
+ # Create and validate a lazar model from a csv file with training data and a json file with metadata
+ # @param [String] file Path of a CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
+ # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
def self.from_csv_file file
metadata_file = file.sub(/csv$/,"json")
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
@@ -391,6 +459,12 @@ module OpenTox
model_validation
end
+ # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
+ # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
+ # @param [OpenTox::Dataset, nil] training_dataset
+ # @param [OpenTox::Feature, nil] prediction_feature
+ # @param [Hash, nil] algorithms
+ # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
# find/import training_dataset
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 06db4d2..73d5f8b 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -1,25 +1,36 @@
module OpenTox
+ # Nanoparticles
class Nanoparticle < Substance
include OpenTox
field :core_id, type: String, default: nil
field :coating_ids, type: Array, default: []
+ # Get core compound
+ # @return [OpenTox::Compound]
def core
Compound.find core_id
end
+ # Get coatings
+ # @return [Array<OpenTox::Compound>]
def coating
coating_ids.collect{|i| Compound.find i }
end
+ # Get nanoparticle fingerprint (union of core and coating fingerprints)
+ # @param [String] fingerprint type
+ # @return [Array<String>]
def fingerprint type=DEFAULT_FINGERPRINT
core_fp = core.fingerprint type
coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact
(core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact
end
+ # Calculate physchem properties
+ # @param [Array<Hash>] list of descriptors
+ # @return [Array<Float>]
def calculate_properties descriptors=PhysChem::OPENBABEL
if core.smiles and !coating.collect{|c| c.smiles}.compact.empty?
core_prop = core.calculate_properties descriptors
@@ -28,6 +39,10 @@ module OpenTox
end
end
+ # Add (measured) feature values
+ # @param [OpenTox::Feature]
+ # @param [TrueClass,FalseClass,Float]
+ # @param [OpenTox::Dataset]
def add_feature feature, value, dataset
unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand
case feature.category
@@ -55,6 +70,10 @@ module OpenTox
end
end
+ # Parse values from Ambit database
+ # @param [OpenTox::Feature]
+ # @param [TrueClass,FalseClass,Float]
+ # @param [OpenTox::Dataset]
def parse_ambit_value feature, v, dataset
# TODO add study id to warnings
v.delete "unit"
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index 31d30c9..91bc9e1 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -2,41 +2,51 @@ require "base64"
class Object
# An object is blank if it's false, empty, or a whitespace string.
# For example, "", " ", +nil+, [], and {} are all blank.
+ # @return [TrueClass,FalseClass]
def blank?
respond_to?(:empty?) ? empty? : !self
end
+ # Is it a numeric object
+ # @return [TrueClass,FalseClass]
def numeric?
true if Float(self) rescue false
end
# Returns dimension of nested arrays
+ # @return [Fixnum]
def dimension
self.class == Array ? 1 + self[0].dimension : 0
end
end
class Numeric
+ # Get this number as a percentage of n
+ # @return [Float]
def percent_of(n)
self.to_f / n.to_f * 100.0
end
end
class Float
- # round to n significant digits
- # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
+ # Round to n significant digits
+ # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
+ # @param [Fixnum]
+ # @return [Float]
def signif(n)
Float("%.#{n}g" % self)
end
- # converts -10 logarithmized values back
+ # Convert -log10 transformed values back to original values
+ # @return [Float]
def delog10
10**(-1*self)
end
end
module Enumerable
- # @return [Array] only the duplicates of an enumerable
+ # Get duplicates
+ # @return [Array]
def duplicates
inject({}) {|h,v| h[v]=h[v].to_i+1; h}.reject{|k,v| v==1}.keys
end
@@ -51,7 +61,10 @@ module Enumerable
end
class String
- # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model)
+ # Convert camel-case to underscore-case
+ # @example
+ # OpenTox::SuperModel -> open_tox/super_model
+ # @return [String]
def underscore
self.gsub(/::/, '/').
gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
@@ -60,7 +73,7 @@ class String
downcase
end
- # convert strings to boolean values
+ # Convert strings to boolean values
# @return [TrueClass,FalseClass] true or false
def to_boolean
return true if self == true || self =~ (/(true|t|yes|y|1)$/i)
@@ -71,7 +84,8 @@ class String
end
class File
- # @return [String] mime_type including charset using linux cmd command
+ # Get mime_type including charset using linux file command
+ # @return [String]
def mime_type
`file -ib '#{self.path}'`.chomp
end
@@ -79,7 +93,7 @@ end
class Array
- # Sum up the size of single arrays in an array of arrays
+ # Sum the size of single arrays in an array of arrays
# @param [Array] Array of arrays
# @return [Integer] Sum of size of array elements
def sum_size
@@ -92,33 +106,43 @@ class Array
}
end
- # For symbolic features
+ # Check if the array has just one unique value.
# @param [Array] Array to test.
- # @return [Boolean] Whether the array has just one unique value.
+ # @return [TrueClass,FalseClass]
def zero_variance?
return self.uniq.size == 1
end
+ # Get the median of an array
+ # @return [Numeric]
def median
sorted = self.sort
len = sorted.length
(sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
end
+ # Get the mean of an array
+ # @return [Numeric]
def mean
self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
end
+ # Get the sample variance of an array
+ # @return [Numeric]
def sample_variance
m = self.mean
sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
sum/(self.compact.length - 1).to_f
end
+ # Get the standard deviation of an array
+ # @return [Numeric]
def standard_deviation
Math.sqrt(self.sample_variance)
end
+ # Convert array values for R
+ # @return [Array]
def for_R
if self.first.is_a?(String)
#"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets
@@ -128,6 +152,8 @@ class Array
end
end
+ # Collect array with index
+ # in analogy to each_with_index
def collect_with_index
result = []
self.each_with_index do |elt, idx|
@@ -139,11 +165,15 @@ end
module URI
+ # Is it an HTTPS URI
+ # @param [String]
+ # @return [TrueClass,FalseClass]
def self.ssl? uri
URI.parse(uri).instance_of? URI::HTTPS
end
- # @return [Boolean] checks if resource exists by making a HEAD-request
+ # Check if a http resource exists by making a HEAD-request
+ # @return [TrueClass,FalseClass]
def self.accessible?(uri)
parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : ""))
http_code = URI.task?(uri) ? 600 : 400
@@ -163,6 +193,9 @@ module URI
false
end
+ # Is the URI valid
+ # @param [String]
+ # @return [TrueClass,FalseClass]
def self.valid? uri
u = URI.parse(uri)
u.scheme!=nil and u.host!=nil
@@ -170,6 +203,8 @@ module URI
false
end
+ # Is the URI a task URI
+ # @param [String]
def self.task? uri
uri =~ /task/ and URI.valid? uri
end
diff --git a/lib/physchem.rb b/lib/physchem.rb
index 327acd8..07df867 100644
--- a/lib/physchem.rb
+++ b/lib/physchem.rb
@@ -39,6 +39,9 @@ module OpenTox
require_relative "unique_descriptors.rb"
+ # Get descriptor features
+ # @param [Hash]
+ # @return [Array<OpenTox::PhysChem>]
def self.descriptors desc=DESCRIPTORS
desc.collect do |name,description|
lib,desc = name.split('.',2)
@@ -46,6 +49,8 @@ module OpenTox
end
end
+ # Get unique descriptor features
+ # @return [Array<OpenTox::PhysChem>]
def self.unique_descriptors
udesc = []
UNIQUEDESCRIPTORS.each do |name|
@@ -64,23 +69,28 @@ module OpenTox
udesc
end
+ # Get OpenBabel descriptor features
+ # @return [Array<OpenTox::PhysChem>]
def self.openbabel_descriptors
descriptors OPENBABEL
end
+ # Get CDK descriptor features
+ # @return [Array<OpenTox::PhysChem>]
def self.cdk_descriptors
descriptors CDK
end
+ # Get JOELIB descriptor features
+ # @return [Array<OpenTox::PhysChem>]
def self.joelib_descriptors
descriptors JOELIB
end
- def calculate compound
- result = send library.downcase,descriptor,compound
- result[self.name]
- end
-
+ # Calculate OpenBabel descriptors
+ # @param [String] descriptor type
+ # @param [OpenTox::Compound]
+ # @return [Hash]
def openbabel descriptor, compound
obdescriptor = OpenBabel::OBDescriptor.find_type descriptor
obmol = OpenBabel::OBMol.new
@@ -90,10 +100,18 @@ module OpenTox
{"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))}
end
+ # Calculate CDK descriptors
+ # @param [String] descriptor type
+ # @param [OpenTox::Compound]
+ # @return [Hash]
def cdk descriptor, compound
java_descriptor "cdk", descriptor, compound
end
+ # Calculate JOELIB descriptors
+ # @param [String] descriptor type
+ # @param [OpenTox::Compound]
+ # @return [Hash]
def joelib descriptor, compound
java_descriptor "joelib", descriptor, compound
end
diff --git a/lib/regression.rb b/lib/regression.rb
index 3890987..fd2855f 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -1,8 +1,13 @@
module OpenTox
module Algorithm
+ # Regression algorithms
class Regression
+ # Weighted average
+ # @param [Array<Float>] dependent_variables
+ # @param [Array<Float>] weights
+ # @return [Hash]
def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
# TODO: prediction_interval
weighted_sum = 0.0
diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb
index 2073be2..f76a296 100644
--- a/lib/rest-client-wrapper.rb
+++ b/lib/rest-client-wrapper.rb
@@ -1,5 +1,6 @@
module OpenTox
+ # Adjustments to the rest-client gem for OpenTox
class RestClientWrapper
attr_accessor :request, :response
diff --git a/lib/similarity.rb b/lib/similarity.rb
index 0901936..ccbc9d6 100644
--- a/lib/similarity.rb
+++ b/lib/similarity.rb
@@ -2,6 +2,10 @@ module OpenTox
module Algorithm
class Vector
+ # Get dot product
+ # @param [Vector]
+ # @param [Vector]
+ # @return [Numeric]
def self.dot_product(a, b)
products = a.zip(b).map{|a, b| a * b}
products.inject(0) {|s,p| s + p}
@@ -15,6 +19,9 @@ module OpenTox
class Similarity
+ # Get Tanimoto similarity
+ # @param [Array<Array<String>>] fingerprints pair of fingerprints to compare
+ # @return [Float]
def self.tanimoto fingerprints
( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
end
@@ -23,18 +30,28 @@ module OpenTox
#( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
#end
+ # Get Euclidean distance
+ # @param [Array<Array<Float>>]
+ # @return [Float]
def self.euclid scaled_properties
sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
Math.sqrt(sq.inject(0) {|s,c| s + c})
end
- # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+ # Get cosine similarity
+ # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+ # @param [Array<Array<Float>>]
+ # @return [Float]
def self.cosine scaled_properties
scaled_properties = remove_nils scaled_properties
Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
end
- def self.weighted_cosine scaled_properties # [a,b,weights]
+ # Get weighted cosine similarity
+ # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+ # @param [Array<Array<Float>>] [a,b,weights]
+ # @return [Float]
+ def self.weighted_cosine scaled_properties
a,b,w = remove_nils scaled_properties
return cosine(scaled_properties) if w.uniq.size == 1
dot_product = 0
@@ -48,6 +65,9 @@ module OpenTox
dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
end
+ # Remove nil values
+ # @param [Array<Array<Float>>] [a,b,weights]
+ # @return [Array<Array<Float>>] [a,b,weights]
def self.remove_nils scaled_properties
a =[]; b = []; w = []
(0..scaled_properties.first.size-1).each do |i|
diff --git a/lib/substance.rb b/lib/substance.rb
index 31c465e..ef49659 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -1,5 +1,6 @@
module OpenTox
+ # Base class for substances (e.g. compounds, nanoparticles)
class Substance
field :properties, type: Hash, default: {}
field :dataset_ids, type: Array, default: []
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
index 71abad2..034ae3a 100644
--- a/lib/train-test-validation.rb
+++ b/lib/train-test-validation.rb
@@ -2,11 +2,17 @@ module OpenTox
module Validation
+ # Training test set validation
class TrainTest < Validation
field :training_dataset_id, type: BSON::ObjectId
field :test_dataset_id, type: BSON::ObjectId
+ # Create a training test set validation
+ # @param [OpenTox::Model::Lazar]
+ # @param [OpenTox::Dataset] training dataset
+ # @param [OpenTox::Dataset] test dataset
+ # @return [OpenTox::Validation::TrainTest]
def self.create model, training_set, test_set
validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
@@ -32,16 +38,21 @@ module OpenTox
validation
end
+ # Get test dataset
+ # @return [OpenTox::Dataset]
def test_dataset
Dataset.find test_dataset_id
end
+ # Get training dataset
+ # @return [OpenTox::Dataset]
def training_dataset
Dataset.find training_dataset_id
end
end
+ # Training test set validation for classification models
class ClassificationTrainTest < TrainTest
include ClassificationStatistics
field :accept_values, type: Array
@@ -54,6 +65,7 @@ module OpenTox
field :probability_plot_id, type: BSON::ObjectId
end
+ # Training test set validation for regression models
class RegressionTrainTest < TrainTest
include RegressionStatistics
field :rmse, type: Float, default:0
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 2202b79..553e6ac 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -1,7 +1,10 @@
module OpenTox
module Validation
+ # Statistical evaluation of classification validations
module ClassificationStatistics
+ # Get statistics
+ # @return [Hash]
def statistics
self.accept_values = model.prediction_feature.accept_values
self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
@@ -63,6 +66,9 @@ module OpenTox
}
end
+ # Plot accuracy vs prediction probability
+ # @param [String,nil] format
+ # @return [Blob]
def probability_plot format: "pdf"
#unless probability_plot_id
@@ -99,8 +105,11 @@ module OpenTox
end
end
+ # Statistical evaluation of regression validations
module RegressionStatistics
+ # Get statistics
+ # @return [Hash]
def statistics
self.rmse = 0
self.mae = 0
@@ -147,10 +156,15 @@ module OpenTox
}
end
+ # Get percentage of measurements within the prediction interval
+ # @return [Float]
def percent_within_prediction_interval
100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
end
+ # Plot predicted vs measured values
+ # @param [String,nil] format
+ # @return [Blob]
def correlation_plot format: "png"
unless correlation_plot_id
tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
@@ -177,6 +191,11 @@ module OpenTox
$gridfs.find_one(_id: correlation_plot_id).data
end
+ # Get predictions with the largest difference between predicted and measured values
+ # @param [Fixnum] n number of predictions
+ # @param [TrueClass,FalseClass,nil] show_neigbors include neighbors
+ # @param [TrueClass,FalseClass,nil] show_common_descriptors show common descriptors
+ # @return [Hash]
def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
worst_predictions.collect do |p|
diff --git a/lib/validation.rb b/lib/validation.rb
index ced9596..c9954b6 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -2,6 +2,7 @@ module OpenTox
module Validation
+ # Base validation class
class Validation
include OpenTox
include Mongoid::Document
@@ -14,6 +15,8 @@ module OpenTox
field :predictions, type: Hash, default: {}
field :finished_at, type: Time
+ # Get model
+ # @return [OpenTox::Model::Lazar]
def model
Model::Lazar.find model_id
end
diff --git a/test/feature.rb b/test/feature.rb
index 533ac0f..40edb9f 100644
--- a/test/feature.rb
+++ b/test/feature.rb
@@ -55,7 +55,7 @@ class FeatureTest < MiniTest::Test
end
def test_physchem_description
- assert_equal 355, PhysChem.descriptors.size
+ assert_equal 346, PhysChem.descriptors.size
assert_equal 15, PhysChem.openbabel_descriptors.size
assert_equal 295, PhysChem.cdk_descriptors.size
assert_equal 45, PhysChem.joelib_descriptors.size
@@ -63,7 +63,7 @@ class FeatureTest < MiniTest::Test
end
def test_physchem
- assert_equal 355, PhysChem.descriptors.size
+ assert_equal 346, PhysChem.descriptors.size
c = Compound.from_smiles "CC(=O)CC(C)C"
logP = PhysChem.find_or_create_by :name => "Openbabel.logP"
assert_equal 1.6215, logP.calculate(c)
diff --git a/test/model-nanoparticle.rb b/test/model-nanoparticle.rb
index 8dc6830..67bbfdd 100644
--- a/test/model-nanoparticle.rb
+++ b/test/model-nanoparticle.rb
@@ -8,6 +8,13 @@ class NanoparticleModelTest < MiniTest::Test
@prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
end
+ def test_core_coating_source_uris
+ @training_dataset.nanoparticles.each do |np|
+ refute_nil np.core.source
+ np.coating.each{|c| refute_nil c.source}
+ end
+ end
+
def test_nanoparticle_model
assert true, @prediction_feature.measured
model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
diff --git a/test/nanomaterial-model-validation.rb b/test/nanomaterial-model-validation.rb
index b91c389..9eaa17d 100644
--- a/test/nanomaterial-model-validation.rb
+++ b/test/nanomaterial-model-validation.rb
@@ -8,7 +8,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
end
def test_default_nanomaterial_validation_model
- validation_model = Model::NanoValidation.create
+ validation_model = Model::Validation.from_enanomapper
[:endpoint,:species,:source].each do |p|
refute_empty validation_model[p]
end
@@ -39,7 +39,7 @@ class NanomaterialValidationModelTest < MiniTest::Test
:prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
:feature_selection => nil
}
- validation_model = Model::NanoValidation.create algorithms: algorithms
+ validation_model = Model::Validation.from_enanomapper algorithms: algorithms
assert validation_model.regression?
refute validation_model.classification?
validation_model.crossvalidations.each do |cv|
@@ -50,6 +50,5 @@ class NanomaterialValidationModelTest < MiniTest::Test
assert_includes nanoparticle.dataset_ids, @training_dataset.id
prediction = validation_model.predict nanoparticle
refute_nil prediction[:value]
- assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
end
end
diff --git a/test/setup.rb b/test/setup.rb
index 63b59fb..40c8ebf 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -6,8 +6,4 @@ include OpenTox
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")
training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-unless training_dataset
- Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
-end
-#$mongo.database.drop
-#$gridfs = $mongo.database.fs
+Import::Enanomapper.import unless training_dataset
diff --git a/test/validation-regression.rb b/test/validation-regression.rb
index 7630521..01ed644 100644
--- a/test/validation-regression.rb
+++ b/test/validation-regression.rb
@@ -83,10 +83,9 @@ class ValidationRegressionTest < MiniTest::Test
model = Model::Lazar.create training_dataset: dataset
repeated_cv = RepeatedCrossValidation.create model
repeated_cv.crossvalidations.each do |cv|
- #assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
- #assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
+ assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
+ assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
end
- File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot}
end
end