From cdab5069ded9490afe81095059e9a407faf864d9 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 10 Jan 2017 13:44:43 +0100 Subject: independent_variables stored in GridFS to avoid Mongo database size limit problems --- lib/compound.rb | 2 -- lib/model.rb | 19 +++++++++++++++++-- test/setup.rb | 6 +----- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index 8a1143b..1c308d8 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -209,7 +209,6 @@ module OpenTox update(:svg_id => $gridfs.insert_one(file)) end $gridfs.find_one(_id: self.svg_id).data - end # Get png image @@ -223,7 +222,6 @@ module OpenTox update(:png_id => $gridfs.insert_one(file)) end Base64.decode64($gridfs.find_one(_id: self.png_id).data) - end # Get all known compound names. Relies on an external service for name lookups. diff --git a/lib/model.rb b/lib/model.rb index 9c4a93f..e5834ae 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -9,6 +9,8 @@ module OpenTox include Mongoid::Timestamps store_in collection: "models" + attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems + field :name, type: String field :creator, type: String, default: __FILE__ field :algorithms, type: Hash, default:{} @@ -17,7 +19,7 @@ module OpenTox field :prediction_feature_id, type: BSON::ObjectId field :dependent_variables, type: Array, default:[] field :descriptor_ids, type:Array, default:[] - field :independent_variables, type: Array, default:[] + field :independent_variables_id, type: BSON::ObjectId field :fingerprints, type: Array, default:[] field :descriptor_weights, type: Array, default:[] field :descriptor_means, type: Array, default:[] @@ -119,6 +121,7 @@ module OpenTox end descriptor_method = model.algorithms[:descriptors][:method] + model.independent_variables = [] case descriptor_method # parse fingerprints when "fingerprint" @@ -179,6 +182,7 @@ module OpenTox def predict_substance substance + @independent_variables = 
Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] when /tanimoto/ # binary features similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] @@ -234,7 +238,7 @@ module OpenTox neighbor_dependent_variables << dependent_variables[i] independent_variables.each_with_index do |c,j| neighbor_independent_variables[j] ||= [] - neighbor_independent_variables[j] << independent_variables[j][i] + neighbor_independent_variables[j] << @independent_variables[j][i] end end end @@ -302,6 +306,17 @@ module OpenTox end + def save # store independent_variables in GridFS to avoid Mongo database size limit problems + file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables") + self.independent_variables_id = $gridfs.insert_one(file) + super + end + + def independent_variables + @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data + @independent_variables + end + def training_dataset Dataset.find(training_dataset_id) end diff --git a/test/setup.rb b/test/setup.rb index 63b59fb..40c8ebf 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -6,8 +6,4 @@ include OpenTox TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first -unless training_dataset - Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") -end -#$mongo.database.drop -#$gridfs = $mongo.database.fs +Import::Enanomapper.import unless training_dataset -- cgit v1.2.3 From 9636f06938619e7596ec19b65daba5dbe8c212c4 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 10 Jan 2017 15:25:42 +0100 Subject: Brief tutorial in README.md --- README.md | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 
deletion(-) diff --git a/README.md b/README.md index 431c8b9..6354606 100644 --- a/README.md +++ b/README.md @@ -26,10 +26,73 @@ Installation The output should give you more verbose information that can help in debugging (e.g. to identify missing libraries). +Tutorial +-------- + +Execute the following commands either from an interactive Ruby shell or a Ruby script: + +Create and use `lazar` models for small molecules +................................................. + +#### Create a training dataset + + Create a CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at [Github](https://github.com/opentox/lazar-public-data). + +#### Create and validate a `lazar` model with default algorithms and parameters + + `validated_model = Model::Validation.create_from_csv_file EPAFHM_log10.csv` + +#### Inspect crossvalidation results + + `validated_model.crossvalidations` + +#### Predict a new compound + + Create a compound + + `compound = Compound.from_smiles "NC(=O)OCCC"` + + Predict Fathead Minnow Acute Toxicity + + `validated_model.predict compound` + +#### Experiment with other algorithms + + You can pass algorithms parameters to the `Model::Validation.create_from_csv_file` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions. + +Create and use `lazar` nanoparticle models +........................................ 
+ +#### Create and validate a `nano-lazar` model from eNanoMapper with default algorithms and parameters + + `validated_model = Model::Validation.create_from_enanomapper` + + This command will mirror the eNanoMapper database in the local database, create a `nano-lazar` model and validate it with five independent 10-fold crossvalidations. + +#### Inspect crossvalidation results + + `validated_model.crossvalidations` + +#### Predict nanoparticle toxicities + + Choose a random nanoparticle from the "Protein Corona" dataset + ``` + training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + nanoparticle = training_dataset.substances.shuffle.first + ``` + + Predict the "Net Cell Association" endpoint + + `validated_model.predict nanoparticle` + +#### Experiment with other datasets, endpoints and algorithms + + You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions. + Documentation ------------- * [API documentation](http://rdoc.info/gems/lazar) Copyright --------- -Copyright (c) 2009-2015 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details. +Copyright (c) 2009-2017 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details. 
-- cgit v1.2.3 From 5db4a0d44073ddce8c1641a29450d31474b9e831 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 10 Jan 2017 15:27:56 +0100 Subject: README.md headings fixed --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6354606..0cffb15 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,7 @@ Tutorial Execute the following commands either from an interactive Ruby shell or a Ruby script: -Create and use `lazar` models for small molecules -................................................. +### Create and use `lazar` models for small molecules #### Create a training dataset @@ -60,8 +59,7 @@ Create and use `lazar` models for small molecules You can pass algorithms parameters to the `Model::Validation.create_from_csv_file` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions. -Create and use `lazar` nanoparticle models -........................................ +### Create and use `lazar` nanoparticle models #### Create and validate a `nano-lazar` model from eNanoMapper with default algorithms and parameters -- cgit v1.2.3 From 093583c8c23b20f18cf82fb15913fe257a3dd72b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 10 Jan 2017 15:31:25 +0100 Subject: README.md CV information fixed --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 0cffb15..7e0e420 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,8 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s `validated_model = Model::Validation.create_from_csv_file EPAFHM_log10.csv` + This command will create a `lazar` model and validate it with three independent 10-fold crossvalidations. 
+ #### Inspect crossvalidation results `validated_model.crossvalidations` -- cgit v1.2.3 From d4e84b31bff853068f4f1602e3aac3d782558399 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 10 Jan 2017 16:29:02 +0100 Subject: initial model documentation --- lib/model.rb | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lib/model.rb b/lib/model.rb index e5834ae..7731705 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -180,6 +180,9 @@ module OpenTox model end + # Predict a substance + # @param [OpenTox::Substance] + # @return [Hash] def predict_substance substance @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data @@ -260,6 +263,9 @@ module OpenTox prediction end + # Predict a substance (compound or nanoparticle), an array of substances or a dataset + # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset] + # @return [Hash, Array, OpenTox::Dataset] def predict object training_dataset = Dataset.find training_dataset_id @@ -345,6 +351,7 @@ module OpenTox class LazarRegression < Lazar end + # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets class Validation include OpenTox @@ -358,6 +365,9 @@ module OpenTox field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId + # Predict a substance (compound or nanoparticle), an array of substances or a dataset + # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset] + # @return [Hash, Array, OpenTox::Dataset] def predict object model.predict object end @@ -394,6 +404,10 @@ module OpenTox model.is_a? LazarClassification end + # Create and validate a lazar model from a csv file with training data and a json file with metadata + # + # @param [File] CSV file with two columns. 
The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at [Github](https://github.com/opentox/lazar-public-data). + # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file -- cgit v1.2.3 From b5d6446f058916d018139948002b6e9d1162d4fe Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 10 Jan 2017 16:56:48 +0100 Subject: model documentation --- lib/model.rb | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/lib/model.rb b/lib/model.rb index 7731705..321636d 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -27,6 +27,11 @@ module OpenTox field :scaled_variables, type: Array, default:[] field :version, type: Hash, default:{} + # Create a lazar model + # @param [OpenTox::Dataset, nil] training_dataset + # @param [OpenTox::Feature, nil] prediction_feature + # @param [Hash] algorithms + # @return [OpenTox::Model::Lazar] def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." 
unless prediction_feature or training_dataset prediction_feature = training_dataset.features.first unless prediction_feature @@ -318,23 +323,33 @@ module OpenTox super end + # Get independent variables + # @return [Array] def independent_variables @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data @independent_variables end + # Get training dataset + # @return [OpenTox::Dataset] def training_dataset Dataset.find(training_dataset_id) end + # Get prediction feature + # @return [OpenTox::Feature] def prediction_feature Feature.find(prediction_feature_id) end + # Get training descriptors + # @return [Array] def descriptors descriptor_ids.collect{|id| Feature.find(id)} end + # Get training substances + # @return [Array] def substances substance_ids.collect{|id| Substance.find(id)} end @@ -345,9 +360,11 @@ module OpenTox end + # Classification model class LazarClassification < Lazar end + # Regression model class LazarRegression < Lazar end @@ -372,26 +389,38 @@ module OpenTox model.predict object end + # Get training dataset + # @return [OpenTox::Dataset] def training_dataset model.training_dataset end + # Get lazar model + # @return [OpenTox::Model::Lazar] def model Lazar.find model_id end + # Get algorithms + # @return [Hash] def algorithms model.algorithms end + # Get prediction feature + # @return [OpenTox::Feature] def prediction_feature model.prediction_feature end + # Get repeated crossvalidations + # @return [OpenTox::Validation::RepeatedCrossValidation] def repeated_crossvalidation OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required end + # Get crossvalidations + # @return [Array Date: Wed, 11 Jan 2017 08:24:23 +0100 Subject: model documentation updated --- README.md | 2 +- lib/model.rb | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7e0e420..28ed18f 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 
@@ Execute the following commands either from an interactive Ruby shell or a Ruby s #### Experiment with other datasets, endpoints and algorithms - You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions. + You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions. Detailed documentation and validation results can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf). Documentation ------------- diff --git a/lib/model.rb b/lib/model.rb index 321636d..64edb76 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -28,11 +28,14 @@ module OpenTox field :version, type: Hash, default:{} # Create a lazar model - # @param [OpenTox::Dataset, nil] training_dataset + # @param [OpenTox::Dataset] training_dataset # @param [OpenTox::Feature, nil] prediction_feature - # @param [Hash] algorithms + # By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature + # @param [Hash, nil] algorithms + # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. 
+ # # @return [OpenTox::Model::Lazar] - def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} + def self.create prediction_feature:nil, training_dataset:, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset prediction_feature = training_dataset.features.first unless prediction_feature # TODO: prediction_feature without training_dataset: use all available data @@ -185,7 +188,7 @@ module OpenTox model end - # Predict a substance + # Predict a substance (compound or nanoparticle) # @param [OpenTox::Substance] # @return [Hash] def predict_substance substance @@ -449,6 +452,7 @@ module OpenTox end # Create and validate a nano-lazar model, import data from eNanoMapper if necessary + # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf # @param [OpenTox::Dataset, nil] training_dataset # @param [OpenTox::Feature, nil] prediction_feature # @param [Hash, nil] algorithms -- cgit v1.2.3 From 04ebe0640ab6e566dfc316f80a020d1e78b10924 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 11 Jan 2017 09:20:40 +0100 Subject: validation documentation --- lib/crossvalidation.rb | 22 ++++++++++++++++++++++ lib/model.rb | 12 ++++++++++-- lib/validation-statistics.rb | 19 +++++++++++++++++++ lib/validation.rb | 3 +++ 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index bcb3ccf..75c5db5 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -1,10 +1,16 @@ module OpenTox module Validation + + # Crossvalidation class CrossValidation < Validation field :validation_ids, type: Array, default: [] field :folds, type: Integer, default: 10 + # Create a crossvalidation + # @param [OpenTox::Model::Lazar] + # @param [Fixnum] number of folds + # @return [OpenTox::Validation::CrossValidation] def self.create model, n=10 $logger.debug 
model.algorithms klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification @@ -41,14 +47,20 @@ module OpenTox cv end + # Get execution time + # @return [Fixnum] def time finished_at - created_at end + # Get individual validations + # @return [Array] def validations validation_ids.collect{|vid| TrainTest.find vid} end + # Get predictions for all compounds + # @return [Array] def predictions predictions = {} validations.each{|v| predictions.merge!(v.predictions)} @@ -56,6 +68,7 @@ module OpenTox end end + # Crossvalidation of classification models class ClassificationCrossValidation < CrossValidation include ClassificationStatistics field :accept_values, type: Array @@ -68,6 +81,7 @@ module OpenTox field :probability_plot_id, type: BSON::ObjectId end + # Crossvalidation of regression models class RegressionCrossValidation < CrossValidation include RegressionStatistics field :rmse, type: Float, default:0 @@ -78,10 +92,16 @@ module OpenTox field :correlation_plot_id, type: BSON::ObjectId end + # Independent repeated crossvalidations class RepeatedCrossValidation < Validation field :crossvalidation_ids, type: Array, default: [] field :correlation_plot_id, type: BSON::ObjectId + # Create repeated crossvalidations + # @param [OpenTox::Model::Lazar] + # @param [Fixnum] number of folds + # @param [Fixnum] number of repeats + # @return [OpenTox::Validation::RepeatedCrossValidation] def self.create model, folds=10, repeats=3 repeated_cross_validation = self.new repeats.times do |n| @@ -92,6 +112,8 @@ module OpenTox repeated_cross_validation end + # Get crossvalidations + # @return [OpenTox::Validation::CrossValidation] def crossvalidations crossvalidation_ids.collect{|id| CrossValidation.find(id)} end diff --git a/lib/model.rb b/lib/model.rb index 64edb76..b18610d 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -320,7 +320,9 @@ module OpenTox end - def save # store independent_variables in GridFS to avoid Mongo database size limit problems + # Save the 
model + # Stores independent_variables in GridFS to avoid Mongo database size limit problems + def save file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables") self.independent_variables_id = $gridfs.insert_one(file) super @@ -357,6 +359,8 @@ module OpenTox substance_ids.collect{|id| Substance.find(id)} end + # Are fingerprints used as descriptors + # @return [TrueClass, FalseClass] def fingerprints? algorithms[:descriptors][:method] == "fingerprint" ? true : false end @@ -428,10 +432,14 @@ module OpenTox repeated_crossvalidation.crossvalidations end + # Is it a regression model + # @return [TrueClass, FalseClass] def regression? model.is_a? LazarRegression end + # Is it a classification model + # @return [TrueClass, FalseClass] def classification? model.is_a? LazarClassification end @@ -452,7 +460,7 @@ module OpenTox end # Create and validate a nano-lazar model, import data from eNanoMapper if necessary - # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf + # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf # @param [OpenTox::Dataset, nil] training_dataset # @param [OpenTox::Feature, nil] prediction_feature # @param [Hash, nil] algorithms diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 2202b79..553e6ac 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -1,7 +1,10 @@ module OpenTox module Validation + # Statistical evaluation of classification validations module ClassificationStatistics + # Get statistics + # @return [Hash] def statistics self.accept_values = model.prediction_feature.accept_values self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} @@ -63,6 +66,9 @@ module OpenTox } end + # Plot accuracy vs prediction probability + # @param [String,nil] format + # @return 
[Blob] def probability_plot format: "pdf" #unless probability_plot_id @@ -99,8 +105,11 @@ module OpenTox end end + # Statistical evaluation of regression validations module RegressionStatistics + # Get statistics + # @return [Hash] def statistics self.rmse = 0 self.mae = 0 @@ -147,10 +156,15 @@ module OpenTox } end + # Get percentage of measurements within the prediction interval + # @return [Float] def percent_within_prediction_interval 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval) end + # Plot predicted vs measured values + # @param [String,nil] format + # @return [Blob] def correlation_plot format: "png" unless correlation_plot_id tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" @@ -177,6 +191,11 @@ module OpenTox $gridfs.find_one(_id: correlation_plot_id).data end + # Get predictions with the largest difference between predicted and measured values + # @param [Fixnum] number of predictions + # @param [TrueClass,FalseClass,nil] include neighbors + # @param [TrueClass,FalseClass,nil] show common descriptors + # @return [Hash] def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n] worst_predictions.collect do |p| diff --git a/lib/validation.rb b/lib/validation.rb index ced9596..c9954b6 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -2,6 +2,7 @@ module OpenTox module Validation + # Base validation class class Validation include OpenTox include Mongoid::Document @@ -14,6 +15,8 @@ module OpenTox field :predictions, type: Hash, default: {} field :finished_at, type: Time + # Get model + # @return [OpenTox::Model::Lazar] def model Model::Lazar.find model_id end -- cgit v1.2.3 From fa4abf3dfc04abcd8bf21f29f73555a5090bc6eb Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 11 Jan 2017 09:25:33 +0100 Subject: LOO documentation --- lib/leave-one-out-validation.rb | 6 ++++++ 1 file 
changed, 6 insertions(+) diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 538b7b3..8d22018 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -2,8 +2,12 @@ module OpenTox module Validation + # Leave one out validation class LeaveOneOut < Validation + # Create a leave one out validation + # @param [OpenTox::Model::Lazar] + # @return [OpenTox::Validation::LeaveOneOut] def self.create model bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection] $logger.debug "#{model.name}: LOO validation started" @@ -32,6 +36,7 @@ module OpenTox end + # Leave one out validation for classification models class ClassificationLeaveOneOut < LeaveOneOut include ClassificationStatistics field :accept_values, type: Array @@ -44,6 +49,7 @@ module OpenTox field :confidence_plot_id, type: BSON::ObjectId end + # Leave one out validation for regression models class RegressionLeaveOneOut < LeaveOneOut include RegressionStatistics field :rmse, type: Float, default: 0 -- cgit v1.2.3 From f522a1089af8775798450b3f9f0aa4b579a3e1b5 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 11 Jan 2017 09:57:36 +0100 Subject: training test set validation documentation --- lib/train-test-validation.rb | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb index 71abad2..034ae3a 100644 --- a/lib/train-test-validation.rb +++ b/lib/train-test-validation.rb @@ -2,11 +2,17 @@ module OpenTox module Validation + # Training test set validation class TrainTest < Validation field :training_dataset_id, type: BSON::ObjectId field :test_dataset_id, type: BSON::ObjectId + # Create a training test set validation + # @param [OpenTox::Model::Lazar] + # @param [OpenTox::Dataset] training dataset + # @param [OpenTox::Dataset] test dataset + # @return 
[OpenTox::Validation::TrainTest] def self.create model, training_set, test_set validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms @@ -32,16 +38,21 @@ module OpenTox validation end + # Get test dataset + # @return [OpenTox::Dataset] def test_dataset Dataset.find test_dataset_id end + # Get training dataset + # @return [OpenTox::Dataset] def training_dataset Dataset.find training_dataset_id end end + # Training test set validation for classification models class ClassificationTrainTest < TrainTest include ClassificationStatistics field :accept_values, type: Array @@ -54,6 +65,7 @@ module OpenTox field :probability_plot_id, type: BSON::ObjectId end + # Training test set validation for regression models class RegressionTrainTest < TrainTest include RegressionStatistics field :rmse, type: Float, default:0 -- cgit v1.2.3 From 85553b339acf3f9285a1c03b2fff342d9ddb9b6b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 11 Jan 2017 16:00:07 +0100 Subject: documentation for all classes --- lib/algorithm.rb | 1 + lib/caret.rb | 13 +++++- lib/classification.rb | 7 +++- lib/compound.rb | 53 +++++++++++++++---------- lib/dataset.rb | 33 ++++++++++++++-- lib/experiment.rb | 99 ---------------------------------------------- lib/feature.rb | 7 ++++ lib/feature_selection.rb | 5 ++- lib/import.rb | 4 +- lib/nanoparticle.rb | 19 +++++++++ lib/overwrite.rb | 57 ++++++++++++++++++++------ lib/physchem.rb | 28 ++++++++++--- lib/regression.rb | 5 +++ lib/rest-client-wrapper.rb | 1 + lib/similarity.rb | 24 ++++++++++- lib/substance.rb | 1 + 16 files changed, 211 insertions(+), 146 deletions(-) delete mode 100644 lib/experiment.rb diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 0e4b93a..f70ac1a 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -2,6 +2,7 @@ module OpenTox module Algorithm + # Execute an algorithm with parameters def self.run algorithm, parameters=nil klass,method 
= algorithm.split('.') Object.const_get(klass).send(method,parameters) diff --git a/lib/caret.rb b/lib/caret.rb index 7e4f771..f5c2bde 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -1,9 +1,17 @@ module OpenTox module Algorithm + # Ruby interface for the R caret package + # Caret model list: https://topepo.github.io/caret/modelList.html class Caret - # model list: https://topepo.github.io/caret/modelList.html + # Create a local R caret model and make a prediction + # @param [Array] dependent_variables + # @param [Array>] independent_variables + # @param [Array] weights + # @param [String] Caret method + # @param [Array] query_variables + # @return [Hash] def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables: remove = [] # remove independent_variables with single values @@ -77,12 +85,13 @@ module OpenTox end - # call caret methods dynamically, e.g. Caret.pls + # Call caret methods dynamically, e.g. Caret.pls def self.method_missing(sym, *args, &block) args.first[:method] = sym.to_s self.create_model_and_predict args.first end + # Convert Ruby values to R values def self.to_r v return "F" if v == false return "T" if v == true diff --git a/lib/classification.rb b/lib/classification.rb index e8c179f..638492b 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -1,9 +1,14 @@ module OpenTox module Algorithm + # Classification algorithms class Classification - def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables: + # Weighted majority vote + # @param [Array] dependent_variables + # @param [Array] weights + # @return [Hash] + def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:nil class_weights = {} dependent_variables.each_with_index do |v,i| class_weights[v] ||= [] diff --git a/lib/compound.rb b/lib/compound.rb index 1c308d8..bfe69e3 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ 
-2,6 +2,7 @@ CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/" module OpenTox + # Small molecules with defined chemical structures class Compound < Substance require_relative "unique_descriptors.rb" DEFAULT_FINGERPRINT = "MP2D" @@ -28,6 +29,9 @@ module OpenTox compound end + # Create chemical fingerprint + # @param [String] fingerprint type + # @return [Array] def fingerprint type=DEFAULT_FINGERPRINT unless fingerprints[type] return [] unless self.smiles @@ -75,6 +79,9 @@ module OpenTox fingerprints[type] end + # Calculate physchem properties + # @param [Array] list of descriptors + # @return [Array] def calculate_properties descriptors=PhysChem::OPENBABEL calculated_ids = properties.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. @@ -96,6 +103,10 @@ module OpenTox descriptors.collect{|d| properties[d.id.to_s]} end + # Match a SMARTS substructure + # @param [String] smarts + # @param [TrueClass,FalseClass] count matches or return true/false + # @return [TrueClass,FalseClass,Fixnum] def smarts_match smarts, count=false obconversion = OpenBabel::OBConversion.new obmol = OpenBabel::OBMol.new @@ -116,8 +127,8 @@ module OpenTox # Create a compound from smiles string # @example # compound = OpenTox::Compound.from_smiles("c1ccccc1") - # @param [String] smiles Smiles string - # @return [OpenTox::Compound] Compound + # @param [String] smiles + # @return [OpenTox::Compound] def self.from_smiles smiles if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces." 
@@ -132,9 +143,9 @@ module OpenTox end end - # Create a compound from inchi string - # @param inchi [String] smiles InChI string - # @return [OpenTox::Compound] Compound + # Create a compound from InChI string + # @param [String] InChI + # @return [OpenTox::Compound] def self.from_inchi inchi #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") @@ -145,9 +156,9 @@ module OpenTox end end - # Create a compound from sdf string - # @param sdf [String] smiles SDF string - # @return [OpenTox::Compound] Compound + # Create a compound from SDF + # @param [String] SDF + # @return [OpenTox::Compound] def self.from_sdf sdf # do not store sdf because it might be 2D Compound.from_smiles obconversion(sdf,"sdf","can") @@ -156,40 +167,38 @@ module OpenTox # Create a compound from name. Relies on an external service for name lookups. # @example # compound = OpenTox::Compound.from_name("Benzene") - # @param name [String] can be also an InChI/InChiKey, CAS number, etc - # @return [OpenTox::Compound] Compound + # @param [String] name, can be also an InChI/InChiKey, CAS number, etc + # @return [OpenTox::Compound] def self.from_name name Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles")) end # Get InChI - # @return [String] InChI string + # @return [String] def inchi unless self["inchi"] - result = obconversion(smiles,"smi","inchi") - #result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp update(:inchi => result.chomp) if result and !result.empty? 
end self["inchi"] end # Get InChIKey - # @return [String] InChIKey string + # @return [String] def inchikey update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"] self["inchikey"] end # Get (canonical) smiles - # @return [String] Smiles string + # @return [String] def smiles update(:smiles => obconversion(self["smiles"],"smi","can")) unless self["smiles"] self["smiles"] end - # Get sdf - # @return [String] SDF string + # Get SDF + # @return [String] def sdf if self.sdf_id.nil? sdf = obconversion(smiles,"smi","sdf") @@ -227,20 +236,22 @@ module OpenTox # Get all known compound names. Relies on an external service for name lookups. # @example # names = compound.names - # @return [String] Compound names + # @return [Array] def names update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"] self["names"] end - # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem + # Get PubChem Compound Identifier (CID), obtained via REST call to PubChem + # @return [String] def cid pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] self["cid"] end - # @return [String] ChEMBL database compound id, derieved via restcall to chembl + # Get ChEMBL database compound id, obtained via REST call to ChEMBL + # @return [String] def chemblid # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" @@ -290,7 +301,7 @@ module OpenTox mg.to_f/molecular_weight end - # Calculate molecular weight of Compound with OB and store it in object + # Calculate molecular weight of Compound with OB and store it in compound object # @return [Float] molecular weight def molecular_weight mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") diff --git a/lib/dataset.rb 
b/lib/dataset.rb index ab55294..44690e1 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -3,32 +3,43 @@ require 'tempfile' module OpenTox + # Collection of substances and features class Dataset field :data_entries, type: Hash, default: {} # Readers + # Get all compounds + # @return [Array] def compounds substances.select{|s| s.is_a? Compound} end + # Get all nanoparticles + # @return [Array] def nanoparticles substances.select{|s| s.is_a? Nanoparticle} end # Get all substances + # @return [Array] def substances @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq @substances end # Get all features + # @return [Array] def features @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq @features end + # Get all values for a given substance and feature + # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id + # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id + # @return [TrueClass,FalseClass,Float] def values substance,feature substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature @@ -41,6 +52,10 @@ module OpenTox # Writers + # Add a value for a given substance and feature + # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id + # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id + # @param [TrueClass,FalseClass,Float] def add(substance,feature,value) substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? 
Feature @@ -87,7 +102,7 @@ module OpenTox # Serialisation - # converts dataset to csv format including compound smiles as first column, other column headers are feature names + # Convert dataset to csv format including compound smiles as first column, other column headers are feature names # @return [String] def to_csv(inchi=false) CSV.generate() do |csv| @@ -130,6 +145,9 @@ module OpenTox #end # Create a dataset from CSV file + # @param [File] + # @param [TrueClass,FalseClass] accept or reject empty values + # @return [OpenTox::Dataset] def self.from_csv_file file, accept_empty_values=false source = file name = File.basename(file,".*") @@ -145,8 +163,10 @@ module OpenTox dataset end - # parse data in tabular format (e.g. from csv) - # does a lot of guesswork in order to determine feature types + # Parse data in tabular format (e.g. from csv) + # does a lot of guesswork in order to determine feature types + # @param [Array] + # @param [TrueClass,FalseClass] accept or reject empty values def parse_table table, accept_empty_values # features @@ -225,6 +245,7 @@ module OpenTox save end + # Delete dataset def delete compounds.each{|c| c.dataset_ids.delete id.to_s} super @@ -238,14 +259,20 @@ module OpenTox field :prediction_feature_id, type: BSON::ObjectId field :predictions, type: Hash, default: {} + # Get prediction feature + # @return [OpenTox::Feature] def prediction_feature Feature.find prediction_feature_id end + # Get all compounds + # @return [Array] def compounds substances.select{|s| s.is_a? 
Compound} end + # Get all substances + # @return [Array] def substances predictions.keys.collect{|id| Substance.find id} end diff --git a/lib/experiment.rb b/lib/experiment.rb deleted file mode 100644 index 0dfdf86..0000000 --- a/lib/experiment.rb +++ /dev/null @@ -1,99 +0,0 @@ -module OpenTox - - class Experiment - field :dataset_ids, type: Array - field :model_settings, type: Array, default: [] - field :results, type: Hash, default: {} - - def run - dataset_ids.each do |dataset_id| - dataset = Dataset.find(dataset_id) - results[dataset_id.to_s] = [] - model_settings.each do |setting| - setting = setting.dup - model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm] - model = Object.const_get(model_algorithm).create dataset, setting - $logger.debug model - model.save - repeated_crossvalidation = RepeatedCrossValidation.create model - results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id} - end - end - save - end - - def report - # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/ - report = {} - report[:name] = name - report[:experiment_id] = self.id.to_s - report[:results] = {} - parameters = [] - dataset_ids.each do |dataset_id| - dataset_name = Dataset.find(dataset_id).name - report[:results][dataset_name] = {} - report[:results][dataset_name][:anova] = {} - report[:results][dataset_name][:data] = [] - # TODO results[dataset_id.to_s] does not exist - results[dataset_id.to_s].each do |result| - model = Model::Lazar.find(result[:model_id]) - repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id]) - crossvalidations = repeated_cv.crossvalidations - if crossvalidations.first.is_a? ClassificationCrossValidation - parameters = [:accuracy,:true_rate,:predictivity] - elsif crossvalidations.first.is_a? 
RegressionCrossValidation - parameters = [:rmse,:mae,:r_squared] - end - summary = {} - [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key| - summary[key] = model[key] - end - summary[:nr_instances] = crossvalidations.first.nr_instances - summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted} - summary[:time] = crossvalidations.collect{|cv| cv.time} - parameters.each do |param| - summary[param] = crossvalidations.collect{|cv| cv.send(param)} - end - report[:results][dataset_name][:data] << summary - end - end - report[:results].each do |dataset,results| - ([:time,:nr_unpredicted]+parameters).each do |param| - experiments = [] - outcome = [] - results[:data].each_with_index do |result,i| - result[param].each do |p| - experiments << i - p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0 - outcome << p - end - end - begin - R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"} - R.eval "experiment_nr = factor(experiment_nr)" - R.assign "outcome", outcome - R.eval "data = data.frame(experiment_nr,outcome)" - # one-way ANOVA - R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')" - # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov - p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby - # aequivalent - # sum = R.eval("summary(fit)") - #p_value = sum.to_ruby.first.last.first - rescue - p_value = nil - end - report[:results][dataset][:anova][param] = p_value -=begin -=end - end - end - report - end - - def summary - report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}} - end - end - -end diff --git a/lib/feature.rb b/lib/feature.rb index 0ca4d41..f811aef 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -8,10 +8,14 @@ module OpenTox field :unit, type: String field :conditions, type: Hash + # Is it a nominal feature + # @return [TrueClass,FalseClass] def nominal? 
self.class == NominalFeature end + # Is it a numeric feature + # @return [TrueClass,FalseClass] def numeric? self.class == NumericFeature end @@ -30,6 +34,9 @@ module OpenTox class Smarts < NominalFeature field :smarts, type: String index "smarts" => 1 + # Create feature from SMARTS string + # @param [String] + # @return [OpenTox::Feature] def self.from_smarts smarts self.find_or_create_by :smarts => smarts end diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb index 65f9752..c596b1f 100644 --- a/lib/feature_selection.rb +++ b/lib/feature_selection.rb @@ -1,13 +1,16 @@ module OpenTox module Algorithm + # Feature selection algorithms class FeatureSelection + # Select features correlated to the models prediction feature + # @param [OpenTox::Model::Lazar] def self.correlation_filter model relevant_features = {} R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)} model.descriptor_weights = [] - selected_variables = [] + selected_variables = [] selected_descriptor_ids = [] model.independent_variables.each_with_index do |v,i| v.collect!{|n| to_r(n)} diff --git a/lib/import.rb b/lib/import.rb index 7a68335..fd00fbe 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -1,12 +1,14 @@ module OpenTox + # Import data from external databases module Import class Enanomapper include OpenTox - # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) + # Import from eNanoMapper def self.import + # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) datasets = {} bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] bundles.each do |bundle| diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 06db4d2..73d5f8b 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -1,25 +1,36 @@ module OpenTox + # Nanoparticles class Nanoparticle < Substance include OpenTox field :core_id, type: String, default: nil field :coating_ids, type: 
Array, default: [] + # Get core compound + # @return [OpenTox::Compound] def core Compound.find core_id end + # Get coatings + # @return [Array] def coating coating_ids.collect{|i| Compound.find i } end + # Get nanoparticle fingerprint (union of core and coating fingerprints) + # @param [String] fingerprint type + # @return [Array] def fingerprint type=DEFAULT_FINGERPRINT core_fp = core.fingerprint type coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact (core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact end + # Calculate physchem properties + # @param [Array] list of descriptors + # @return [Array] def calculate_properties descriptors=PhysChem::OPENBABEL if core.smiles and !coating.collect{|c| c.smiles}.compact.empty? core_prop = core.calculate_properties descriptors @@ -28,6 +39,10 @@ module OpenTox end end + # Add (measured) feature values + # @param [OpenTox::Feature] + # @param [TrueClass,FalseClass,Float] + # @param [OpenTox::Dataset] def add_feature feature, value, dataset unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand case feature.category @@ -55,6 +70,10 @@ module OpenTox end end + # Parse values from Ambit database + # @param [OpenTox::Feature] + # @param [TrueClass,FalseClass,Float] + # @param [OpenTox::Dataset] def parse_ambit_value feature, v, dataset # TODO add study id to warnings v.delete "unit" diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 31d30c9..91bc9e1 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -2,41 +2,51 @@ require "base64" class Object # An object is blank if it's false, empty, or a whitespace string. # For example, "", " ", +nil+, [], and {} are all blank. + # @return [TrueClass,FalseClass] def blank? respond_to?(:empty?) ? empty? : !self end + # Is it a numeric object + # @return [TrueClass,FalseClass] def numeric? 
true if Float(self) rescue false end # Returns dimension of nested arrays + # @return [Fixnum] def dimension self.class == Array ? 1 + self[0].dimension : 0 end end class Numeric + # Convert number to percent + # @return [Float] def percent_of(n) self.to_f / n.to_f * 100.0 end end class Float - # round to n significant digits - # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby + # Round to n significant digits + # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby + # @param [Fixnum] + # @return [Float] def signif(n) Float("%.#{n}g" % self) end - # converts -10 logarithmized values back + # Convert -10 log values to original values + # @return [Float] def delog10 10**(-1*self) end end module Enumerable - # @return [Array] only the duplicates of an enumerable + # Get duplicates + # @return [Array] def duplicates inject({}) {|h,v| h[v]=h[v].to_i+1; h}.reject{|k,v| v==1}.keys end @@ -51,7 +61,10 @@ module Enumerable end class String - # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model) + # Convert camel-case to underscore-case + # @example + # OpenTox::SuperModel -> open_tox/super_model + # @return [String] def underscore self.gsub(/::/, '/'). gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2'). 
@@ -60,7 +73,7 @@ class String downcase end - # convert strings to boolean values + # Convert strings to boolean values # @return [TrueClass,FalseClass] true or false def to_boolean return true if self == true || self =~ (/(true|t|yes|y|1)$/i) @@ -71,7 +84,8 @@ class String end class File - # @return [String] mime_type including charset using linux cmd command + # Get mime_type including charset using linux file command + # @return [String] def mime_type `file -ib '#{self.path}'`.chomp end @@ -79,7 +93,7 @@ end class Array - # Sum up the size of single arrays in an array of arrays + # Sum the size of single arrays in an array of arrays # @param [Array] Array of arrays # @return [Integer] Sum of size of array elements def sum_size @@ -92,33 +106,43 @@ class Array } end - # For symbolic features + # Check if the array has just one unique value. # @param [Array] Array to test. - # @return [Boolean] Whether the array has just one unique value. + # @return [TrueClass,FalseClass] def zero_variance? 
return self.uniq.size == 1 end + # Get the median of an array + # @return [Numeric] def median sorted = self.sort len = sorted.length (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0 end + # Get the mean of an array + # @return [Numeric] def mean self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size end + # Get the variance of an array + # @return [Numeric] def sample_variance m = self.mean sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 } sum/(self.compact.length - 1).to_f end + # Get the standard deviation of an array + # @return [Numeric] def standard_deviation Math.sqrt(self.sample_variance) end + # Convert array values for R + # @return [Array] def for_R if self.first.is_a?(String) #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets @@ -128,6 +152,8 @@ class Array end end + # Collect array with index + # in analogy to each_with_index def collect_with_index result = [] self.each_with_index do |elt, idx| @@ -139,11 +165,15 @@ end module URI + # Is it a https connection + # @param [String] + # @return [TrueClass,FalseClass] def self.ssl? uri URI.parse(uri).instance_of? URI::HTTPS end - # @return [Boolean] checks if resource exists by making a HEAD-request + # Check if a http resource exists by making a HEAD-request + # @return [TrueClass,FalseClass] def self.accessible?(uri) parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : "")) http_code = URI.task?(uri) ? 600 : 400 @@ -163,6 +193,9 @@ module URI false end + # Is the URI valid + # @param [String] + # @return [TrueClass,FalseClass] def self.valid? uri u = URI.parse(uri) u.scheme!=nil and u.host!=nil @@ -170,6 +203,8 @@ module URI false end + # Is the URI a task URI + # @param [String] def self.task? uri uri =~ /task/ and URI.valid? 
uri end diff --git a/lib/physchem.rb b/lib/physchem.rb index 327acd8..07df867 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -39,6 +39,9 @@ module OpenTox require_relative "unique_descriptors.rb" + # Get descriptor features + # @param [Hash] + # @return [Array] def self.descriptors desc=DESCRIPTORS desc.collect do |name,description| lib,desc = name.split('.',2) @@ -46,6 +49,8 @@ module OpenTox end end + # Get unique descriptor features + # @return [Array] def self.unique_descriptors udesc = [] UNIQUEDESCRIPTORS.each do |name| @@ -64,23 +69,28 @@ module OpenTox udesc end + # Get OpenBabel descriptor features + # @return [Array] def self.openbabel_descriptors descriptors OPENBABEL end + # Get CDK descriptor features + # @return [Array] def self.cdk_descriptors descriptors CDK end + # Get JOELIB descriptor features + # @return [Array] def self.joelib_descriptors descriptors JOELIB end - def calculate compound - result = send library.downcase,descriptor,compound - result[self.name] - end - + # Calculate OpenBabel descriptors + # @param [String] descriptor type + # @param [OpenTox::Compound] + # @return [Hash] def openbabel descriptor, compound obdescriptor = OpenBabel::OBDescriptor.find_type descriptor obmol = OpenBabel::OBMol.new @@ -90,10 +100,18 @@ module OpenTox {"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))} end + # Calculate CDK descriptors + # @param [String] descriptor type + # @param [OpenTox::Compound] + # @return [Hash] def cdk descriptor, compound java_descriptor "cdk", descriptor, compound end + # Calculate JOELIB descriptors + # @param [String] descriptor type + # @param [OpenTox::Compound] + # @return [Hash] def joelib descriptor, compound java_descriptor "joelib", descriptor, compound end diff --git a/lib/regression.rb b/lib/regression.rb index 3890987..fd2855f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -1,8 +1,13 @@ module OpenTox module Algorithm + # Regression algorithms class Regression + # 
Weighted average + # @param [Array] dependent_variables + # @param [Array] weights + # @return [Hash] def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil # TODO: prediction_interval weighted_sum = 0.0 diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb index 2073be2..f76a296 100644 --- a/lib/rest-client-wrapper.rb +++ b/lib/rest-client-wrapper.rb @@ -1,5 +1,6 @@ module OpenTox + # Adjustments to the rest-client gem for OpenTox class RestClientWrapper attr_accessor :request, :response diff --git a/lib/similarity.rb b/lib/similarity.rb index 0901936..ccbc9d6 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -2,6 +2,10 @@ module OpenTox module Algorithm class Vector + # Get dot product + # @param [Vector] + # @param [Vector] + # @return [Numeric] def self.dot_product(a, b) products = a.zip(b).map{|a, b| a * b} products.inject(0) {|s,p| s + p} @@ -15,6 +19,9 @@ module OpenTox class Similarity + # Get Tanimoto similarity + # @param [Array>] + # @return [Float] def self.tanimoto fingerprints ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f end @@ -23,18 +30,28 @@ module OpenTox #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f #end + # Get Euclidean distance + # @param [Array>] + # @return [Float] def self.euclid scaled_properties sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2} Math.sqrt(sq.inject(0) {|s,c| s + c}) end - # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity + # Get cosine similarity + # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity + # @param [Array>] + # @return [Float] def self.cosine scaled_properties scaled_properties = remove_nils scaled_properties Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / 
(Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1])) end - def self.weighted_cosine scaled_properties # [a,b,weights] + # Get weighted cosine similarity + # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity + # @param [Array>] [a,b,weights] + # @return [Float] + def self.weighted_cosine scaled_properties a,b,w = remove_nils scaled_properties return cosine(scaled_properties) if w.uniq.size == 1 dot_product = 0 @@ -48,6 +65,9 @@ module OpenTox dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b)) end + # Remove nil values + # @param [Array>] [a,b,weights] + # @return [Array>] [a,b,weights] def self.remove_nils scaled_properties a =[]; b = []; w = [] (0..scaled_properties.first.size-1).each do |i| diff --git a/lib/substance.rb b/lib/substance.rb index 31c465e..ef49659 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,5 +1,6 @@ module OpenTox + # Base class for substances (e.g. 
compounds, nanoparticles) class Substance field :properties, type: Hash, default: {} field :dataset_ids, type: Array, default: [] -- cgit v1.2.3 From d7504cc422bbaeee3546589d87e7baeb4e977c0b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 12 Jan 2017 17:57:03 +0100 Subject: source uris for core and coating --- lib/import.rb | 2 ++ test/feature.rb | 4 ++-- test/model-nanoparticle.rb | 7 +++++++ test/nanomaterial-model-validation.rb | 5 ++--- test/validation-regression.rb | 5 ++--- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/lib/import.rb b/lib/import.rb index fd00fbe..96e7ad1 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -22,6 +22,7 @@ module OpenTox uri = c["component"]["compound"]["URI"] uri = CGI.escape File.join(uri,"&media=application/json") data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}") + source = data["dataEntry"][0]["compound"]["URI"] smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] names = [] names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] @@ -33,6 +34,7 @@ module OpenTox else compound = Compound.find_or_create_by(:name => names.first,:names => names.compact) end + compound.source = source compound.save if c["relation"] == "HAS_CORE" core_id = compound.id.to_s diff --git a/test/feature.rb b/test/feature.rb index 533ac0f..40edb9f 100644 --- a/test/feature.rb +++ b/test/feature.rb @@ -55,7 +55,7 @@ class FeatureTest < MiniTest::Test end def test_physchem_description - assert_equal 355, PhysChem.descriptors.size + assert_equal 346, PhysChem.descriptors.size assert_equal 15, PhysChem.openbabel_descriptors.size assert_equal 295, PhysChem.cdk_descriptors.size assert_equal 45, PhysChem.joelib_descriptors.size @@ -63,7 +63,7 @@ class FeatureTest < MiniTest::Test end def
test_physchem - assert_equal 355, PhysChem.descriptors.size + assert_equal 346, PhysChem.descriptors.size c = Compound.from_smiles "CC(=O)CC(C)C" logP = PhysChem.find_or_create_by :name => "Openbabel.logP" assert_equal 1.6215, logP.calculate(c) diff --git a/test/model-nanoparticle.rb b/test/model-nanoparticle.rb index 8dc6830..67bbfdd 100644 --- a/test/model-nanoparticle.rb +++ b/test/model-nanoparticle.rb @@ -8,6 +8,13 @@ class NanoparticleModelTest < MiniTest::Test @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first end + def test_core_coating_source_uris + @training_dataset.nanoparticles.each do |np| + refute_nil np.core.source + np.coating.each{|c| refute_nil c.source} + end + end + def test_nanoparticle_model assert true, @prediction_feature.measured model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature diff --git a/test/nanomaterial-model-validation.rb b/test/nanomaterial-model-validation.rb index b91c389..9eaa17d 100644 --- a/test/nanomaterial-model-validation.rb +++ b/test/nanomaterial-model-validation.rb @@ -8,7 +8,7 @@ class NanomaterialValidationModelTest < MiniTest::Test end def test_default_nanomaterial_validation_model - validation_model = Model::NanoValidation.create + validation_model = Model::Validation.from_enanomapper [:endpoint,:species,:source].each do |p| refute_empty validation_model[p] end @@ -39,7 +39,7 @@ class NanomaterialValidationModelTest < MiniTest::Test :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" }, :feature_selection => nil } - validation_model = Model::NanoValidation.create algorithms: algorithms + validation_model = Model::Validation.from_enanomapper algorithms: algorithms assert validation_model.regression? refute validation_model.classification? 
validation_model.crossvalidations.each do |cv| @@ -50,6 +50,5 @@ class NanomaterialValidationModelTest < MiniTest::Test assert_includes nanoparticle.dataset_ids, @training_dataset.id prediction = validation_model.predict nanoparticle refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." end end diff --git a/test/validation-regression.rb b/test/validation-regression.rb index 7630521..01ed644 100644 --- a/test/validation-regression.rb +++ b/test/validation-regression.rb @@ -83,10 +83,9 @@ class ValidationRegressionTest < MiniTest::Test model = Model::Lazar.create training_dataset: dataset repeated_cv = RepeatedCrossValidation.create model repeated_cv.crossvalidations.each do |cv| - #assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034" - #assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split" + assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034" + assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split" end - File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot} end end -- cgit v1.2.3