summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-08-25 17:20:55 +0200
committerChristoph Helma <helma@in-silico.ch>2015-08-25 17:20:55 +0200
commitf8faf510b4574df1a00fa61a9f0a1681fc2f4857 (patch)
treeacdbe6666ca5f528be368c6f9fdf4d7fb51d031e
parent8c6c59980bc82dc2177147f2fe34adf8bfbc1539 (diff)
Experiments added
-rw-r--r--lib/classification.rb2
-rw-r--r--lib/compound.rb1
-rw-r--r--lib/crossvalidation.rb109
-rw-r--r--lib/dataset.rb15
-rw-r--r--lib/experiment.rb66
-rw-r--r--lib/feature.rb2
-rw-r--r--lib/lazar.rb6
-rw-r--r--lib/model.rb27
-rw-r--r--lib/opentox.rb2
-rw-r--r--lib/overwrite.rb6
-rw-r--r--lib/regression.rb29
-rw-r--r--test/dataset.rb2
-rw-r--r--test/experiment.rb31
-rw-r--r--test/lazar-long.rb2
-rw-r--r--test/lazar-regression.rb7
-rw-r--r--test/setup.rb4
-rw-r--r--test/validation.rb7
17 files changed, 261 insertions, 57 deletions
diff --git a/lib/classification.rb b/lib/classification.rb
index 723c66f..0d47983 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -3,7 +3,7 @@ module OpenTox
class Classification
- def self.weighted_majority_vote neighbors
+ def self.weighted_majority_vote compound, neighbors
return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty?
weighted_sum = {}
sim_sum = 0.0
diff --git a/lib/compound.rb b/lib/compound.rb
index fa57aff..a819f56 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -178,6 +178,7 @@ module OpenTox
end
def neighbors threshold=0.7
+ # TODO restrict to dataset
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
qn = fp4.size
#qmin = qn * threshold
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 5af75bf..4407aeb 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -102,6 +102,8 @@ module OpenTox
field :mae, type: Float
field :weighted_rmse, type: Float
field :weighted_mae, type: Float
+ field :correlation_plot_id, type: BSON::ObjectId
def self.create model, n=10
cv = self.new
@@ -135,10 +137,11 @@ module OpenTox
weighted_rae = 0
n = 0
confidence_sum = 0
+ nil_activities = []
predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
if activity and prediction
- error = prediction-activity
+ error = Math.log(prediction)-Math.log(activity)
rmse += error**2
weighted_rmse += confidence*error**2
mae += error.abs
@@ -147,13 +150,36 @@ module OpenTox
confidence_sum += confidence
else
# TODO: create warnings
- p pred
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{training_dataset.id}."
+ nil_activities << pred
end
end
+ predictions -= nil_activities
+ x = predictions.collect{|p| p[1]}
+ y = predictions.collect{|p| p[2]}
+ R.assign "Measurement", x
+ R.assign "Prediction", y
+ R.eval "corr <- lm(-log(Measurement) ~ I(-log(Prediction)))" # I() required: a bare '-' on a formula RHS removes the term instead of negating it
+ s = R.eval "summary <- summary(corr)"
+ p R.eval("summary$r.squared").to_ruby
+ #p s.to_ruby
+ #p s.to_ruby.first
+ s.to_ruby.each_with_index do |l,i|
+ #p i
+ #p l
+ end
mae = mae/n
weighted_mae = weighted_mae/confidence_sum
rmse = Math.sqrt(rmse/n)
weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+ # TODO check!!
+ predictions.sort! do |a,b|
+ relative_error_a = (a[1]-a[2]).abs/a[1].to_f
+ relative_error_a = 1/relative_error_a if relative_error_a < 1
+ relative_error_b = (b[1]-b[2]).abs/b[1].to_f
+ relative_error_b = 1/relative_error_b if relative_error_b < 1
+ [relative_error_b,b[3]] <=> [relative_error_a,a[3]]
+ end
cv.update_attributes(
name: model.name,
model_id: model.id,
@@ -161,7 +187,7 @@ module OpenTox
validation_ids: validation_ids,
nr_instances: nr_instances,
nr_unpredicted: nr_unpredicted,
- predictions: predictions.sort{|a,b| b[3] <=> a[3]},
+ predictions: predictions,#.sort{|a,b| [(b[1]-b[2]).abs/b[1].to_f,b[3]] <=> [(a[1]-a[2]).abs/a[1].to_f,a[3]]},
mae: mae,
rmse: rmse,
weighted_mae: weighted_mae,
@@ -171,27 +197,62 @@ module OpenTox
cv
end
- def plot
- # RMSE
- x = predictions.collect{|p| p[1]}
- y = predictions.collect{|p| p[2]}
- R.assign "Measurement", x
- R.assign "Prediction", y
- R.eval "par(pty='s')" # sets the plot type to be square
- #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
- #R.eval "error <- log(Measurement)-log(Prediction)"
- R.eval "error <- Measurement-Prediction"
- R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
- R.eval "mae <- mean( abs(error), na.rm = TRUE)"
- R.eval "r <- cor(log(Prediction),log(Measurement))"
- R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
- R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
- #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
- #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
- R.eval "abline(0,1,col='blue')"
- #R.eval "abline(fitline,col='red')"
- R.eval "dev.off()"
- "/tmp/#{id.to_s}.svg"
+ def misclassifications n=nil
+ #n = predictions.size unless n
+ n = 20 unless n
+ model = Model::Lazar.find(self.model_id)
+ training_dataset = Dataset.find(model.training_dataset_id)
+ prediction_feature = training_dataset.features.first
+ predictions[0..n-1].collect do |p|
+ compound = Compound.find(p[0])
+ neighbors = compound.neighbors.collect do |nbr| # do not shadow the method parameter n
+ neighbor = Compound.find(nbr[0])
+ values = training_dataset.values(neighbor,prediction_feature)
+ { :smiles => neighbor.smiles, :fingerprint => neighbor.fp4.collect{|id| Smarts.find(id).name},:similarity => nbr[1], :measurements => values}
+ end
+ {
+ :smiles => compound.smiles,
+ :fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
+ :measured => p[1],
+ :predicted => p[2],
+ :relative_error => (p[1]-p[2]).abs/p[1].to_f,
+ :confidence => p[3],
+ :neighbors => neighbors
+ }
+ end
+ end
+
+ def correlation_plot
+ unless correlation_plot_id
+ tmpfile = "/tmp/#{id.to_s}.svg"
+ x = predictions.collect{|p| p[1]}
+ y = predictions.collect{|p| p[2]}
+ attributes = Model::Lazar.find(self.model_id).attributes
+ attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
+ attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
+ p "'"+attributes
+ R.eval "library(ggplot2)"
+ R.eval "library(grid)"
+ R.eval "library(gridExtra)"
+ R.assign "measurement", x
+ R.assign "prediction", y
+ #R.eval "error <- log(Measurement)-log(Prediction)"
+ #R.eval "rmse <- sqrt(mean(error^2, na.rm=T))"
+ #R.eval "mae <- mean(abs(error), na.rm=T)"
+ R.eval "r <- cor(-log(prediction),-log(measurement))"
+ R.eval "svg(filename='#{tmpfile}')"
+ R.eval "all = c(-log(measurement),-log(prediction))"
+ R.eval "range = c(min(all), max(all))"
+ R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
+ R.eval "image = image + geom_abline(intercept=0, slope=1) + stat_smooth(method='lm', se=FALSE)"
+ R.eval "text = textGrob(paste('RMSE: ', '#{rmse.round(2)},','MAE:','#{mae.round(2)},','r^2: ',round(r^2,2),'\n\n','#{attributes}'),just=c('left','top'),check.overlap = T)"
+ R.eval "grid.arrange(image, text, ncol=2)"
+ R.eval "dev.off()"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
+ plot_id = $gridfs.insert_one(file)
+ update(:correlation_plot_id => plot_id)
+ end
+ $gridfs.find_one(_id: correlation_plot_id).data
end
end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 5850c3d..b3f5392 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -10,7 +10,7 @@ module OpenTox
# associations like has_many, belongs_to deteriorate performance
field :feature_ids, type: Array, default: []
field :compound_ids, type: Array, default: []
- field :data_entries_id, type: BSON::ObjectId, default: []
+ field :data_entries_id, type: BSON::ObjectId#, default: []
field :source, type: String
field :warnings, type: Array, default: []
@@ -19,9 +19,9 @@ module OpenTox
def save_all
dump = Marshal.dump(@data_entries)
file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
- data_entries_id = $gridfs.insert_one(file)
- update(:data_entries_id => data_entries_id)
- save
+ entries_id = $gridfs.insert_one(file)
+ update(:data_entries_id => entries_id)
+ #save
end
# Readers
@@ -125,11 +125,11 @@ module OpenTox
# Serialisation
- # converts dataset to csv format including compound smiles as first column, other column headers are feature titles
+ # converts dataset to csv format including compound smiles as first column, other column headers are feature names
# @return [String]
def to_csv(inchi=false)
CSV.generate() do |csv| #{:force_quotes=>true}
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.title}
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
compounds.each_with_index do |c,i|
csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
end
@@ -149,9 +149,10 @@ module OpenTox
# Create a dataset from CSV file
# TODO: document structure
def self.from_csv_file file, source=nil, bioassay=true
+ $logger.debug "Parsing #{file}."
source ||= file
table = CSV.read file, :skip_blanks => true
- dataset = self.new(:source => source, :name => File.basename(file))
+ dataset = self.new(:source => source, :name => File.basename(file,".*"))
dataset.parse_table table, bioassay
dataset
end
diff --git a/lib/experiment.rb b/lib/experiment.rb
new file mode 100644
index 0000000..b3ed174
--- /dev/null
+++ b/lib/experiment.rb
@@ -0,0 +1,66 @@
+module OpenTox
+
+ class Experiment
+ field :dataset_ids, type: Array
+ field :model_algorithms, type: Array
+ field :model_ids, type: Array, default: []
+ field :crossvalidation_ids, type: Array, default: []
+ field :prediction_algorithms, type: Array
+ field :neighbor_algorithms, type: Array
+ field :neighbor_algorithm_parameters, type: Array
+
+ # TODO more sophisticated experimental design
+ def run
+ dataset_ids.each do |dataset_id|
+ dataset = Dataset.find(dataset_id)
+ model_algorithms.each do |model_algorithm|
+ prediction_algorithms.each do |prediction_algorithm|
+ neighbor_algorithms.each do |neighbor_algorithm|
+ neighbor_algorithm_parameters.each do |neighbor_algorithm_parameter|
+ $logger.debug "Creating #{model_algorithm} model for dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}."
+ model = Object.const_get(model_algorithm).create dataset
+ model.prediction_algorithm = prediction_algorithm
+ model.neighbor_algorithm = neighbor_algorithm
+ model.neighbor_algorithm_parameters = neighbor_algorithm_parameter
+ model.save
+ model_ids << model.id
+ cv = nil
+ if dataset.features.first.nominal
+ cv = ClassificationCrossValidation
+ elsif dataset.features.first.numeric
+ cv = RegressionCrossValidation
+ end
+ if cv
+ $logger.debug "Creating #{cv} for #{model_algorithm}, dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}."
+ crossvalidation = cv.create model
+ crossvalidation_ids << crossvalidation.id
+ else
+ $logger.warn "#{dataset.features.first} is neither nominal nor numeric."
+ end
+ end
+ end
+ end
+ end
+ end
+ save
+ end
+
+ def self.create params
+ experiment = self.new params
+ $logger.debug "Experiment started ..."
+ experiment.run
+ experiment
+ end
+
+ def report
+ crossvalidation_ids.each do |id|
+ cv = CrossValidation.find(id)
+ file = "/tmp/#{cv.name}.svg"
+ File.open(file,"w+"){|f| f.puts cv.correlation_plot}
+ `inkview '#{file}'`
+ #p Crossvalidation.find(id).correlation_plot
+ end
+ end
+ end
+
+end
diff --git a/lib/feature.rb b/lib/feature.rb
index 22b2846..9521597 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -2,7 +2,7 @@ module OpenTox
# Basic feature class
class Feature
- field :name, as: :title, type: String
+ field :name, type: String
field :nominal, type: Boolean
field :numeric, type: Boolean
field :measured, type: Boolean
diff --git a/lib/lazar.rb b/lib/lazar.rb
index d0128b7..5903556 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -16,7 +16,6 @@ ENV["MONGOID_ENV"] ||= "development"
# TODO remove config files, change default via ENV or directly in Mongoid class
Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}")
$mongo = Mongoid.default_client
-#$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
$gridfs = $mongo.database.fs
# R setup
@@ -26,8 +25,8 @@ R = Rserve::Connection.new
STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files
$logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
$logger.level = Logger::DEBUG
-Mongo::Logger.logger = $logger
Mongo::Logger.level = Logger::WARN
+#Mongo::Logger.logger = $logger
# Require sub-Repositories
require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
@@ -43,7 +42,7 @@ ENV['FMINER_SILENT'] = 'true'
ENV['FMINER_NR_HITS'] = 'true'
# OpenTox classes and includes
-CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algorithm and Models are modules
+CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Experiment"]# Algorithm and Models are modules
[ # be aware of the require sequence as it affects class/method overwrites
"overwrite.rb",
@@ -64,5 +63,6 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor
"regression.rb",
"validation.rb",
"crossvalidation.rb",
+ "experiment.rb",
].each{ |f| require_relative f }
diff --git a/lib/model.rb b/lib/model.rb
index 185d70f..418ec18 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -2,24 +2,27 @@ module OpenTox
module Model
- class Lazar
+ class Model
include OpenTox
include Mongoid::Document
include Mongoid::Timestamps
store_in collection: "models"
- field :title, as: :name, type: String
+ field :name, type: String
field :creator, type: String, default: __FILE__
# datasets
field :training_dataset_id, type: BSON::ObjectId
# algorithms
field :prediction_algorithm, type: String
- field :neighbor_algorithm, type: String
- field :neighbor_algorithm_parameters, type: Hash
# prediction feature
field :prediction_feature_id, type: BSON::ObjectId
+ end
- #belongs_to :prediction
+ class Lazar < Model
+
+ # algorithms
+ field :neighbor_algorithm, type: String
+ field :neighbor_algorithm_parameters, type: Hash
attr_accessor :prediction_dataset
attr_accessor :training_dataset
@@ -36,7 +39,7 @@ module OpenTox
prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
lazar.training_dataset_id = training_dataset.id
lazar.prediction_feature_id = prediction_feature.id
- lazar.title = prediction_feature.title
+ lazar.name = "#{training_dataset.name} #{prediction_feature.name}"
lazar.save
lazar
@@ -83,7 +86,7 @@ module OpenTox
acts.empty? ? nil : n << acts
end
neighbors.compact! # remove neighbors without training activities
- predictions << Algorithm.run(prediction_algorithm, neighbors)
+ predictions << Algorithm.run(prediction_algorithm, compound, neighbors)
end
# serialize result
@@ -97,14 +100,14 @@ module OpenTox
when "OpenTox::Dataset"
# prepare prediction dataset
prediction_dataset = LazarPrediction.new(
- :title => "Lazar prediction for #{prediction_feature.title}",
+ :name => "Lazar prediction for #{prediction_feature.name}",
:creator => __FILE__,
:prediction_feature_id => prediction_feature.id
)
- confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
+ confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" )
# TODO move into warnings field
- warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
+ warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
prediction_dataset.compounds = compounds
prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
@@ -112,6 +115,10 @@ module OpenTox
return prediction_dataset
end
+ def training_dataset
+ return Dataset.find(training_dataset_id)
+ end
+
end
def training_activities
diff --git a/lib/opentox.rb b/lib/opentox.rb
index 33293ac..53b34e9 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -12,7 +12,7 @@ module OpenTox
include Mongoid::Document
include Mongoid::Timestamps
store_in collection: klass.downcase.pluralize
- field :title, as: :name, type: String
+ field :name, type: String
end
OpenTox.const_set klass,c
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index df515eb..cb47527 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -81,6 +81,12 @@ class Array
return self.uniq.size == 1
end
+ def median
+ sorted = self.sort
+ len = sorted.length
+ (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
+ end
+
end
module URI
diff --git a/lib/regression.rb b/lib/regression.rb
index 0bc6547..020bb3a 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -19,7 +19,7 @@ module OpenTox
class Regression
- def self.weighted_average neighbors
+ def self.weighted_average compound, neighbors
weighted_sum = 0.0
sim_sum = 0.0
neighbors.each do |row|
@@ -34,6 +34,33 @@ module OpenTox
{:value => prediction,:confidence => confidence}
end
+ def self.local_linear_regression compound, neighbors
+ p neighbors.size
+ return nil unless neighbors.size > 0
+ features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq
+ p features
+ training_data = Array.new(neighbors.size){Array.new(features.size,0)}
+ neighbors.each_with_index do |n,i|
+ #p n.first
+ neighbor = Compound.find n.first
+ features.each_with_index do |f,j|
+ training_data[i][j] = 1 if neighbor.fp4.include? f
+ end
+ end
+ p training_data
+
+ R.assign "activities", neighbors.collect{|n| n[2].median}
+ R.assign "features", training_data
+ R.eval "model <- lm(activities ~ features)"
+ R.eval "summary <- summary(model)"
+ p R.summary
+ compound_features = features.collect{|f| compound.fp4.include?(f) ? 1 : 0} # parens needed: include? f ? 1 : 0 parses as include?(f ? 1 : 0)
+ R.assign "compound_features", compound_features
+ R.eval "prediction <- predict(model,compound_features)"
+ p R.prediction
+
+ end
+
def self.weighted_average_with_relevant_fingerprints neighbors
weighted_sum = 0.0
sim_sum = 0.0
diff --git a/test/dataset.rb b/test/dataset.rb
index 27dba61..b5275d4 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -78,7 +78,7 @@ class DatasetTest < MiniTest::Test
new_dataset = Dataset.find d.id
# get metadata
assert_match "multicolumn.csv", new_dataset.source
- assert_equal "multicolumn.csv", new_dataset.title
+ assert_equal "multicolumn.csv", new_dataset.name
# get features
assert_equal 6, new_dataset.features.size
assert_equal 7, new_dataset.compounds.size
diff --git a/test/experiment.rb b/test/experiment.rb
new file mode 100644
index 0000000..eae7fa0
--- /dev/null
+++ b/test/experiment.rb
@@ -0,0 +1,31 @@
+require_relative "setup.rb"
+
+class ExperimentTest < MiniTest::Test
+
+ def test_regression_experiment
+ datasets = [
+ "EPAFHM.csv",
+ "FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv",
+ "LOAEL_log_mmol_corrected_smiles.csv"
+ ]
+ model_algorithms = ["OpenTox::Model::LazarRegression"]
+ neighbor_algorithms = ["OpenTox::Algorithm::Neighbor.fingerprint_similarity"]
+ prediction_algorithms = ["OpenTox::Algorithm::Regression.weighted_average"]
+ neighbor_algorithm_parameters = [{:min_sim => 0.7}]
+ experiment = Experiment.create(
+ :name => "Regression for datasets #{datasets}.",
+ :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id},
+ :model_algorithms => model_algorithms,
+ :neighbor_algorithms => neighbor_algorithms,
+ :neighbor_algorithm_parameters => neighbor_algorithm_parameters,
+ :prediction_algorithms => prediction_algorithms,
+ )
+ experiment.run
+=begin
+ experiment = Experiment.find "55dc58b32b72ed14a8000008"
+=end
+ p experiment.id
+ experiment.report
+ refute_empty experiment.crossvalidation_ids
+ end
+end
diff --git a/test/lazar-long.rb b/test/lazar-long.rb
index c0deaa2..1b58319 100644
--- a/test/lazar-long.rb
+++ b/test/lazar-long.rb
@@ -29,7 +29,7 @@ class LazarExtendedTest < MiniTest::Test
feature_dataset = OpenTox::CalculatedDataset.find model.feature_dataset_id
assert_equal dataset.compounds.size, feature_dataset.compounds.size
assert_equal 52, feature_dataset.features.size
- assert_equal '[#17&A]-[#6&A]', feature_dataset.features.first.title
+ assert_equal '[#17&A]-[#6&A]', feature_dataset.features.first.name
compound = OpenTox::Compound.from_inchi("InChI=1S/C10H9NO2S/c1-8-2-4-9(5-3-8)13-6-10(12)11-7-14/h2-5H,6H2,1H3")
prediction_dataset = model.predict compound
prediction = prediction_dataset.data_entries.first
diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb
index c36f521..4062cfd 100644
--- a/test/lazar-regression.rb
+++ b/test/lazar-regression.rb
@@ -12,11 +12,12 @@ class LazarRegressionTest < MiniTest::Test
assert_equal 1, prediction[:neighbors].size
end
- def test_weighted_average_with_relevant_fingerprints
+ def test_local_linear_regression
+ skip
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
model = Model::LazarRegression.create training_dataset
- model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average_with_relevant_fingerprints")
- compound = Compound.from_smiles "CC(C)(C)CN"
+ model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.local_linear_regression")
+ compound = Compound.from_smiles "NC(=O)OCCC"
prediction = model.predict compound
p prediction
#assert_equal 13.6, prediction[:value].round(1)
diff --git a/test/setup.rb b/test/setup.rb
index 538853d..3dad683 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -3,5 +3,5 @@ require_relative '../lib/lazar.rb'
include OpenTox
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")
-$mongo.database.drop
-$gridfs = $mongo.database.fs # recreate GridFS indexes
+#$mongo.database.drop
+#$gridfs = $mongo.database.fs # recreate GridFS indexes
diff --git a/test/validation.rb b/test/validation.rb
index 485769c..009c337 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -24,8 +24,8 @@ class ValidationTest < MiniTest::Test
end
def test_regression_crossvalidation
- dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
- #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
+ #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
model = Model::LazarRegression.create dataset
cv = RegressionCrossValidation.create model
p cv.rmse
@@ -33,6 +33,9 @@ class ValidationTest < MiniTest::Test
p cv.mae
p cv.weighted_mae
#`inkview #{cv.plot}`
+ #puts JSON.pretty_generate(cv.misclassifications)#.collect{|l| l.join ", "}.join "\n"
+ p cv.misclassifications.collect{|l| l[:neighbors].size}
+ `inkview #{cv.plot}`
assert cv.rmse < 30, "RMSE > 30"
assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
assert cv.mae < 12