From cdab5069ded9490afe81095059e9a407faf864d9 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 10 Jan 2017 13:44:43 +0100 Subject: independent_variables stored in GridFS to avoid Mongo database size limit problems --- lib/model.rb | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 9c4a93f..e5834ae 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -9,6 +9,8 @@ module OpenTox include Mongoid::Timestamps store_in collection: "models" + attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems + field :name, type: String field :creator, type: String, default: __FILE__ field :algorithms, type: Hash, default:{} @@ -17,7 +19,7 @@ module OpenTox field :prediction_feature_id, type: BSON::ObjectId field :dependent_variables, type: Array, default:[] field :descriptor_ids, type:Array, default:[] - field :independent_variables, type: Array, default:[] + field :independent_variables_id, type: BSON::ObjectId field :fingerprints, type: Array, default:[] field :descriptor_weights, type: Array, default:[] field :descriptor_means, type: Array, default:[] @@ -119,6 +121,7 @@ module OpenTox end descriptor_method = model.algorithms[:descriptors][:method] + model.independent_variables = [] case descriptor_method # parse fingerprints when "fingerprint" @@ -179,6 +182,7 @@ module OpenTox def predict_substance substance + @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] when /tanimoto/ # binary features similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] @@ -234,7 +238,7 @@ module OpenTox neighbor_dependent_variables << dependent_variables[i] independent_variables.each_with_index do |c,j| neighbor_independent_variables[j] ||= [] - neighbor_independent_variables[j] << independent_variables[j][i] + neighbor_independent_variables[j] << @independent_variables[j][i] end end end @@ -302,6 +306,17 @@ module OpenTox end + def save # store independent_variables in GridFS to avoid Mongo database size limit problems + file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables") + self.independent_variables_id = $gridfs.insert_one(file) + super + end + + def independent_variables + @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data + @independent_variables + end + def training_dataset Dataset.find(training_dataset_id) end -- cgit v1.2.3 From d4e84b31bff853068f4f1602e3aac3d782558399 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 10 Jan 2017 16:29:02 +0100 Subject: initial model documentation --- lib/model.rb | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index e5834ae..7731705 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -180,6 +180,9 @@ module OpenTox model end + # Predict a substance + # @param [OpenTox::Substance] + # @return [Hash] def predict_substance substance @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data @@ -260,6 +263,9 @@ module OpenTox prediction end + # Predict a substance (compound or nanoparticle), an array of substances or a dataset + # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset] + # @return [Hash, Array, OpenTox::Dataset] def predict object training_dataset = Dataset.find training_dataset_id @@ -345,6 +351,7 @@ module OpenTox class LazarRegression < Lazar end + # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets class Validation include OpenTox @@ -358,6 +365,9 @@ module OpenTox field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId + # Predict a substance (compound or nanoparticle), an array of substances or a dataset + # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset] + # @return [Hash, Array, OpenTox::Dataset] def predict object model.predict object end @@ -394,6 +404,10 @@ module OpenTox model.is_a? LazarClassification end + # Create and validate a lazar model from a csv file with training data and a json file with metadata + # + # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at [Github](https://github.com/opentox/lazar-public-data). + # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file -- cgit v1.2.3 From b5d6446f058916d018139948002b6e9d1162d4fe Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 10 Jan 2017 16:56:48 +0100 Subject: model documentation --- lib/model.rb | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 7731705..321636d 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -27,6 +27,11 @@ module OpenTox field :scaled_variables, type: Array, default:[] field :version, type: Hash, default:{} + # Create a lazar model + # @param [OpenTox::Dataset, nil] training_dataset + # @param [OpenTox::Feature, nil] prediction_feature + # @param [Hash] algorithms + # @return [OpenTox::Model::Lazar] def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset prediction_feature = training_dataset.features.first unless prediction_feature @@ -318,23 +323,33 @@ module OpenTox super end + # Get independent variables + # @return [Array] def independent_variables @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data @independent_variables end + # Get training dataset + # @return [OpenTox::Dataset] def training_dataset Dataset.find(training_dataset_id) end + # Get prediction feature + # @return [OpenTox::Feature] def prediction_feature Feature.find(prediction_feature_id) end + # Get training descriptors + # @return [Array] def descriptors descriptor_ids.collect{|id| Feature.find(id)} end + # Get training substances + # @return [Array] def substances substance_ids.collect{|id| Substance.find(id)} end @@ -345,9 +360,11 @@ module OpenTox end + # Classification model class LazarClassification < Lazar end + # Regression model class LazarRegression < Lazar end @@ -372,26 +389,38 @@ module OpenTox model.predict object end + # Get training dataset + # @return [OpenTox::Dataset] def training_dataset model.training_dataset end + # Get lazar model + # @return [OpenTox::Model::Lazar] def model Lazar.find model_id end + # Get algorithms + # @return [Hash] def algorithms model.algorithms end + # Get prediction feature + # @return [OpenTox::Feature] def prediction_feature model.prediction_feature end + # Get repeated crossvalidations + # @return [OpenTox::Validation::RepeatedCrossValidation] def repeated_crossvalidation OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required end + # Get crossvalidations + # @return [Array Date: Wed, 11 Jan 2017 08:24:23 +0100 Subject: model documentation updated --- lib/model.rb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 321636d..64edb76 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -28,11 +28,14 @@ module OpenTox field :version, type: Hash, default:{} # Create a lazar model - # @param [OpenTox::Dataset, nil] training_dataset + # @param [OpenTox::Dataset] training_dataset # @param [OpenTox::Feature, nil] prediction_feature - # @param [Hash] algorithms + # By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature + # @param [Hash, nil] algorithms + # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. + # # @return [OpenTox::Model::Lazar] - def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} + def self.create prediction_feature:nil, training_dataset:, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset prediction_feature = training_dataset.features.first unless prediction_feature # TODO: prediction_feature without training_dataset: use all available data @@ -185,7 +188,7 @@ module OpenTox model end - # Predict a substance + # Predict a substance (compound or nanoparticle) # @param [OpenTox::Substance] # @return [Hash] def predict_substance substance @@ -449,6 +452,7 @@ module OpenTox end # Create and validate a nano-lazar model, import data from eNanoMapper if necessary + # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf # @param [OpenTox::Dataset, nil] training_dataset # @param [OpenTox::Feature, nil] prediction_feature # @param [Hash, nil] algorithms -- cgit v1.2.3 From 04ebe0640ab6e566dfc316f80a020d1e78b10924 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 11 Jan 2017 09:20:40 +0100 Subject: validation documentation --- lib/model.rb | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 64edb76..b18610d 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -320,7 +320,9 @@ module OpenTox end - def save # store independent_variables in GridFS to avoid Mongo database size limit problems + # Save the model + # Stores independent_variables in GridFS to avoid Mongo database size limit problems + def save file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables") self.independent_variables_id = $gridfs.insert_one(file) super @@ -357,6 +359,8 @@ module OpenTox substance_ids.collect{|id| Substance.find(id)} end + # Are fingerprints used as descriptors + # @return [TrueClass, FalseClass] def fingerprints? algorithms[:descriptors][:method] == "fingerprint" ? true : false end @@ -428,10 +432,14 @@ module OpenTox repeated_crossvalidation.crossvalidations end + # Is it a regression model + # @return [TrueClass, FalseClass] def regression? model.is_a? LazarRegression end + # Is it a classification model + # @return [TrueClass, FalseClass] def classification? model.is_a? LazarClassification end @@ -452,7 +460,7 @@ module OpenTox end # Create and validate a nano-lazar model, import data from eNanoMapper if necessary - # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf + # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf # @param [OpenTox::Dataset, nil] training_dataset # @param [OpenTox::Feature, nil] prediction_feature # @param [Hash, nil] algorithms -- cgit v1.2.3