module OpenTox

  # Result of validating a model against a test dataset.
  #
  # Stores the ids of the validation model, the prediction dataset, the test
  # dataset and (optionally) the enclosing crossvalidation, together with the
  # individual predictions.
  #
  # NOTE(review): `field` is presumably provided by a Mongoid document mixin
  # included where this class is first opened - confirm, it is not visible here.
  class Validation

    field :model_id, type: BSON::ObjectId
    field :prediction_dataset_id, type: BSON::ObjectId
    field :crossvalidation_id, type: BSON::ObjectId
    field :test_dataset_id, type: BSON::ObjectId
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    # Each entry is [compound_id, activities, prediction, confidence]
    # (see Validation.create).
    field :predictions, type: Array

    # @return [Dataset] dataset looked up by prediction_dataset_id
    def prediction_dataset
      Dataset.find prediction_dataset_id
    end

    # @return [Dataset] dataset looked up by test_dataset_id
    def test_dataset
      Dataset.find test_dataset_id
    end

    # @return [Model::Lazar] model looked up by model_id
    def model
      Model::Lazar.find model_id
    end

    # Build a fresh model of the same class as `model` on `training_set`,
    # predict the compounds of `test_set` with it and save the outcome as a
    # validation.
    #
    # @param model [Object] model whose class and attributes are reused
    # @param training_set [Object] dataset the validation model is created from
    # @param test_set [Object] dataset supplying compound ids and measured activities
    # @param crossvalidation [Object, nil] optional owner; its id is stored when given
    # @return [Validation] the saved validation
    def self.create model, training_set, test_set, crossvalidation=nil
      atts = model.attributes.dup # do not modify attributes from original model
      atts["_id"] = BSON::ObjectId.new
      atts[:training_dataset_id] = training_set.id
      validation_model = model.class.create training_set, atts
      validation_model.save

      cids = test_set.compound_ids
      # remove duplicates and make sure that activities cannot be used
      test_set_without_activities = Dataset.new(:compound_ids => cids.uniq)
      prediction_dataset = validation_model.predict test_set_without_activities

      # Index the test set rows by compound id once, instead of rescanning
      # cids for every predicted compound.
      rows_by_cid = cids.each_index.group_by { |row| cids[row] }
      test_entries = test_set.data_entries
      predicted_cids = prediction_dataset.compound_ids

      predictions = []
      nr_unpredicted = 0
      prediction_dataset.data_entries.each_with_index do |de, i|
        prediction = de[0]
        confidence = de[1]
        if prediction and confidence
          cid = predicted_cids[i]
          # a compound may occur in several test set rows; keep all its activities
          activities = rows_by_cid.fetch(cid, []).collect { |row| test_entries[row][0] }
          predictions << [cid, activities, prediction, confidence]
        else
          nr_unpredicted += 1
        end
      end

      validation = self.new(
        :model_id => validation_model.id,
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => cids.size,
        :nr_unpredicted => nr_unpredicted,
        :predictions => predictions #.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
      )
      validation.crossvalidation_id = crossvalidation.id if crossvalidation
      validation.save
      validation
    end

    # Compute regression statistics on the log10 scale over all predictions
    # that have both a prediction and at least one measured activity.
    #
    # Entries without a prediction or without activities are skipped; a
    # message is appended to `warnings` and logged for each of them. Averages
    # are taken over the evaluated entries only. The three metrics are also
    # printed to stdout.
    #
    # NOTE(review): `warnings` and Array#median are not defined in this file -
    # presumably provided elsewhere in the project; confirm.
    #
    # @return [Hash] "R^2", "RMSE" and "MAE"; all values are nil when no entry
    #   could be evaluated
    def statistics
      rmse = 0
      weighted_rmse = 0
      mae = 0
      weighted_mae = 0
      confidence_sum = 0
      measured = []
      predicted = []

      predictions.each do |compound_id, activity, prediction, confidence|
        # an empty activity list is truthy but has no median, so check it explicitly
        if prediction and activity and !activity.empty?
          median = activity.median
          measured << median
          predicted << prediction
          error = Math.log10(prediction)-Math.log10(median)
          rmse += error**2
          weighted_rmse += confidence*error**2
          mae += error.abs
          weighted_mae += confidence*error.abs
          confidence_sum += confidence
        else
          message = "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
          warnings << message
          $logger.debug message
        end
      end

      # Nothing to evaluate: the averages and the correlation are undefined.
      return { "R^2" => nil, "RMSE" => nil, "MAE" => nil } if measured.empty?

      # Only evaluated pairs are handed to R, so both vectors have equal length.
      R.assign "measurement", measured
      R.assign "prediction", predicted
      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
      r = R.eval("r").to_ruby
      r_squared = r**2

      evaluated = measured.size
      mae = mae/evaluated
      rmse = Math.sqrt(rmse/evaluated)
      # The weighted variants are only consumed by the disabled persistence below.
      if confidence_sum.zero?
        weighted_mae = nil
        weighted_rmse = nil
      else
        weighted_mae = weighted_mae/confidence_sum
        weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
      end

      # Persisting the results is currently disabled:
      # update_attributes(
      #   mae: mae,
      #   rmse: rmse,
      #   weighted_mae: weighted_mae,
      #   weighted_rmse: weighted_rmse,
      #   r_squared: r_squared,
      #   finished_at: Time.now
      # )

      puts "R^2 #{r_squared}"
      puts "RMSE #{rmse}"
      puts "MAE #{mae}"
      { "R^2" => r_squared, "RMSE" => rmse, "MAE" => mae }
    end
  end

  # Validation of a classification model; adds nothing to Validation here.
  class ClassificationValidation < Validation
  end

  # Validation of a regression model; adds nothing to Validation here.
  class RegressionValidation < Validation
  end

end