Diffstat (limited to 'lib/model.rb')
-rw-r--r--  lib/model.rb  80
1 file changed, 77 insertions, 3 deletions
diff --git a/lib/model.rb b/lib/model.rb
index 9c4a93f..b18610d 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -9,6 +9,8 @@ module OpenTox
include Mongoid::Timestamps
store_in collection: "models"
+ attr_writer :independent_variables # store in GridFS to avoid the 16 MB MongoDB document size limit
+
field :name, type: String
field :creator, type: String, default: __FILE__
field :algorithms, type: Hash, default:{}
@@ -17,7 +19,7 @@ module OpenTox
field :prediction_feature_id, type: BSON::ObjectId
field :dependent_variables, type: Array, default:[]
field :descriptor_ids, type:Array, default:[]
- field :independent_variables, type: Array, default:[]
+ field :independent_variables_id, type: BSON::ObjectId
field :fingerprints, type: Array, default:[]
field :descriptor_weights, type: Array, default:[]
field :descriptor_means, type: Array, default:[]
@@ -25,7 +27,15 @@ module OpenTox
field :scaled_variables, type: Array, default:[]
field :version, type: Hash, default:{}
- def self.create prediction_feature:nil, training_dataset:nil, algorithms:{}
+ # Create a lazar model
+ # @param [OpenTox::Dataset] training_dataset
+ # @param [OpenTox::Feature, nil] prediction_feature
+ #   By default the first feature of the training dataset will be predicted; specify a prediction_feature to predict a different feature.
+ # @param [Hash, nil] algorithms
+ #   Default algorithms are used if no algorithms parameter is provided. The algorithms hash has the following keys (default parameters are used for unspecified keys):
+ #   - :descriptors (descriptors used for similarity calculations and local QSAR models)
+ #   - :similarity (similarity algorithm and threshold)
+ #   - :feature_selection (feature selection algorithm)
+ #   - :prediction (local QSAR algorithm)
+ #
+ # @return [OpenTox::Model::Lazar]
+ def self.create prediction_feature:nil, training_dataset:, algorithms:{}
bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
prediction_feature = training_dataset.features.first unless prediction_feature
# TODO: prediction_feature without training_dataset: use all available data
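
# Illustrative sketch (not part of this commit): a create call with an explicit
# algorithms hash. The key structure follows the documentation above; the
# concrete method names and values are assumptions, not verified defaults.
#
#   model = OpenTox::Model::Lazar.create(
#     training_dataset: training_dataset,
#     algorithms: {
#       descriptors: {method: "fingerprint", type: "MP2D"},
#       similarity: {method: "Algorithm::Similarity.tanimoto", min: 0.1},
#       prediction: {method: "Algorithm::Classification.weighted_majority_vote"}
#     }
#   )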
@@ -119,6 +129,7 @@ module OpenTox
end
descriptor_method = model.algorithms[:descriptors][:method]
+ model.independent_variables = []
case descriptor_method
# parse fingerprints
when "fingerprint"
@@ -177,8 +188,12 @@ module OpenTox
model
end
+ # Predict a substance (compound or nanoparticle)
+ # @param [OpenTox::Substance] substance
+ # @return [Hash]
def predict_substance substance
+ @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data # load once from GridFS and memoize
case algorithms[:similarity][:method]
when /tanimoto/ # binary features
similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
@@ -234,7 +249,7 @@ module OpenTox
neighbor_dependent_variables << dependent_variables[i]
independent_variables.each_with_index do |c,j|
neighbor_independent_variables[j] ||= []
- neighbor_independent_variables[j] << independent_variables[j][i]
+ neighbor_independent_variables[j] << @independent_variables[j][i]
end
end
end
@@ -256,6 +271,9 @@ module OpenTox
prediction
end
+ # Predict a substance (compound or nanoparticle), an array of substances or a dataset
+ # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset] object
+ # @return [Hash, Array<Hash>, OpenTox::Dataset]
def predict object
training_dataset = Dataset.find training_dataset_id
@@ -302,34 +320,62 @@ module OpenTox
end
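
# Illustrative sketch (not part of this commit) of the three input types
# accepted by #predict, per the documentation above. Compound.from_smiles is
# assumed to be available as elsewhere in lazar; the variables are placeholders.
#
#   model.predict OpenTox::Compound.from_smiles("CCO")  # single substance => Hash
#   model.predict [compound_a, compound_b]              # array of substances => Array<Hash>
#   model.predict test_dataset                          # dataset => OpenTox::Dataset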
+ # Save the model
+ # Stores independent_variables in GridFS to avoid the 16 MB MongoDB document size limit
+ def save
+ file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables")
+ self.independent_variables_id = $gridfs.insert_one(file)
+ super
+ end
+
+ # Get independent variables
+ # @return [Array<Array>]
+ def independent_variables
+ @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
+ end
+
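# Minimal sketch of the GridFS round trip used by #save and
# #independent_variables above, assuming $gridfs is the global GridFS handle
# set up elsewhere in lazar; the data values are illustrative.
#
#   data = Marshal.dump [[1.0, 2.0], [3.0, 4.0]]
#   file = Mongo::Grid::File.new(data, :filename => "example.independent_variables")
#   fid = $gridfs.insert_one(file)
#   restored = Marshal.load $gridfs.find_one(_id: fid).data  # => [[1.0, 2.0], [3.0, 4.0]]
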
+ # Get training dataset
+ # @return [OpenTox::Dataset]
def training_dataset
Dataset.find(training_dataset_id)
end
+ # Get prediction feature
+ # @return [OpenTox::Feature]
def prediction_feature
Feature.find(prediction_feature_id)
end
+ # Get training descriptors
+ # @return [Array<OpenTox::Feature>]
def descriptors
descriptor_ids.collect{|id| Feature.find(id)}
end
+ # Get training substances
+ # @return [Array<OpenTox::Substance>]
def substances
substance_ids.collect{|id| Substance.find(id)}
end
+ # Are fingerprints used as descriptors?
+ # @return [TrueClass, FalseClass]
def fingerprints?
algorithms[:descriptors][:method] == "fingerprint" ? true : false
end
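
# Sketch (not part of this commit): inspecting a trained model through the
# accessors defined above; the return values depend on the training call.
#
#   model.fingerprints?       # => true if fingerprint descriptors are used
#   model.prediction_feature  # => OpenTox::Feature
#   model.descriptors.size    # => number of training descriptors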
end
+ # Classification model
class LazarClassification < Lazar
end
+ # Regression model
class LazarRegression < Lazar
end
+ # Convenience class for generating and validating lazar models in a single step, and for predicting substances (compounds and nanoparticles), arrays of substances, and datasets
class Validation
include OpenTox
@@ -343,42 +389,64 @@ module OpenTox
field :model_id, type: BSON::ObjectId
field :repeated_crossvalidation_id, type: BSON::ObjectId
+ # Predict a substance (compound or nanoparticle), an array of substances or a dataset
+ # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array<OpenTox::Substance>, OpenTox::Dataset] object
+ # @return [Hash, Array<Hash>, OpenTox::Dataset]
def predict object
model.predict object
end
+ # Get training dataset
+ # @return [OpenTox::Dataset]
def training_dataset
model.training_dataset
end
+ # Get lazar model
+ # @return [OpenTox::Model::Lazar]
def model
Lazar.find model_id
end
+ # Get algorithms
+ # @return [Hash]
def algorithms
model.algorithms
end
+ # Get prediction feature
+ # @return [OpenTox::Feature]
def prediction_feature
model.prediction_feature
end
+ # Get repeated crossvalidation
+ # @return [OpenTox::Validation::RepeatedCrossValidation]
def repeated_crossvalidation
OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
end
+ # Get crossvalidations
+ # @return [Array<OpenTox::CrossValidation>]
def crossvalidations
repeated_crossvalidation.crossvalidations
end
+ # Is it a regression model?
+ # @return [TrueClass, FalseClass]
def regression?
model.is_a? LazarRegression
end
+ # Is it a classification model?
+ # @return [TrueClass, FalseClass]
def classification?
model.is_a? LazarClassification
end
+ # Create and validate a lazar model from a CSV file with training data and a JSON file with metadata
+ # @param [File] file
+ #   CSV file with two columns and a header line naming the structure format (SMILES or InChI) and the endpoint. The first column contains the SMILES or InChI of the training compounds, the second column their toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets.
+ #   Add metadata in a JSON file with the same basename, containing the fields "species", "endpoint", "source" and "unit" (regression only). Example training data: https://github.com/opentox/lazar-public-data.
+ # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
def self.from_csv_file file
metadata_file = file.sub(/csv$/,"json")
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
@@ -391,6 +459,12 @@ module OpenTox
model_validation
end
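
# Hedged usage sketch for from_csv_file; file names and contents are
# illustrative, not taken from this commit (real training data:
# https://github.com/opentox/lazar-public-data).
#
#   hamster_carcinogenicity.csv:
#     SMILES,Carcinogenicity
#     CCO,false
#     c1ccccc1N,true
#   hamster_carcinogenicity.json:
#     {"species": "Hamster", "endpoint": "Carcinogenicity", "source": "example"}
#
#   validation_model = OpenTox::Model::Validation.from_csv_file "hamster_carcinogenicity.csv"
#   validation_model.classification?        # => true for qualitative endpoints
#   validation_model.crossvalidations.size  # => 3 (see @return above)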
+ # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
+ # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
+ # @param [OpenTox::Dataset, nil] training_dataset
+ # @param [OpenTox::Feature, nil] prediction_feature
+ # @param [Hash, nil] algorithms
+ # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil
# find/import training_dataset